qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90
  91/*----------------------------------------------------------------------------
  92| Primitive arithmetic functions, including multi-word arithmetic, and
  93| division and square root approximations.  (Can be specialized to target if
  94| desired.)
  95*----------------------------------------------------------------------------*/
  96#include "softfloat-macros.h"
  97
  98/*----------------------------------------------------------------------------
  99| Functions and definitions to determine:  (1) whether tininess for underflow
 100| is detected before or after rounding by default, (2) what (if anything)
 101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103| are propagated from function inputs to output.  These details are target-
 104| specific.
 105*----------------------------------------------------------------------------*/
 106#include "softfloat-specialize.h"
 107
 108/*----------------------------------------------------------------------------
 109| Returns the fraction bits of the half-precision floating-point value `a'.
 110*----------------------------------------------------------------------------*/
 111
 112static inline uint32_t extractFloat16Frac(float16 a)
 113{
 114    return float16_val(a) & 0x3ff;
 115}
 116
 117/*----------------------------------------------------------------------------
 118| Returns the exponent bits of the half-precision floating-point value `a'.
 119*----------------------------------------------------------------------------*/
 120
 121static inline int extractFloat16Exp(float16 a)
 122{
 123    return (float16_val(a) >> 10) & 0x1f;
 124}
 125
 126/*----------------------------------------------------------------------------
 127| Returns the sign bit of the single-precision floating-point value `a'.
 128*----------------------------------------------------------------------------*/
 129
 130static inline flag extractFloat16Sign(float16 a)
 131{
 132    return float16_val(a)>>15;
 133}
 134
 135/*----------------------------------------------------------------------------
 136| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 137| and 7, and returns the properly rounded 32-bit integer corresponding to the
 138| input.  If `zSign' is 1, the input is negated before being converted to an
 139| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 140| is simply rounded to an integer, with the inexact exception raised if the
 141| input cannot be represented exactly as an integer.  However, if the fixed-
 142| point input is too large, the invalid exception is raised and the largest
 143| positive or negative integer is returned.
 144*----------------------------------------------------------------------------*/
 145
 146static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 147{
 148    int8_t roundingMode;
 149    flag roundNearestEven;
 150    int8_t roundIncrement, roundBits;
 151    int32_t z;
 152
 153    roundingMode = status->float_rounding_mode;
 154    roundNearestEven = ( roundingMode == float_round_nearest_even );
 155    switch (roundingMode) {
 156    case float_round_nearest_even:
 157    case float_round_ties_away:
 158        roundIncrement = 0x40;
 159        break;
 160    case float_round_to_zero:
 161        roundIncrement = 0;
 162        break;
 163    case float_round_up:
 164        roundIncrement = zSign ? 0 : 0x7f;
 165        break;
 166    case float_round_down:
 167        roundIncrement = zSign ? 0x7f : 0;
 168        break;
 169    default:
 170        abort();
 171    }
 172    roundBits = absZ & 0x7F;
 173    absZ = ( absZ + roundIncrement )>>7;
 174    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 175    z = absZ;
 176    if ( zSign ) z = - z;
 177    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 178        float_raise(float_flag_invalid, status);
 179        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 180    }
 181    if (roundBits) {
 182        status->float_exception_flags |= float_flag_inexact;
 183    }
 184    return z;
 185
 186}
 187
 188/*----------------------------------------------------------------------------
 189| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 190| `absZ1', with binary point between bits 63 and 64 (between the input words),
 191| and returns the properly rounded 64-bit integer corresponding to the input.
 192| If `zSign' is 1, the input is negated before being converted to an integer.
 193| Ordinarily, the fixed-point input is simply rounded to an integer, with
 194| the inexact exception raised if the input cannot be represented exactly as
 195| an integer.  However, if the fixed-point input is too large, the invalid
 196| exception is raised and the largest positive or negative integer is
 197| returned.
 198*----------------------------------------------------------------------------*/
 199
 200static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 201                               float_status *status)
 202{
 203    int8_t roundingMode;
 204    flag roundNearestEven, increment;
 205    int64_t z;
 206
 207    roundingMode = status->float_rounding_mode;
 208    roundNearestEven = ( roundingMode == float_round_nearest_even );
 209    switch (roundingMode) {
 210    case float_round_nearest_even:
 211    case float_round_ties_away:
 212        increment = ((int64_t) absZ1 < 0);
 213        break;
 214    case float_round_to_zero:
 215        increment = 0;
 216        break;
 217    case float_round_up:
 218        increment = !zSign && absZ1;
 219        break;
 220    case float_round_down:
 221        increment = zSign && absZ1;
 222        break;
 223    default:
 224        abort();
 225    }
 226    if ( increment ) {
 227        ++absZ0;
 228        if ( absZ0 == 0 ) goto overflow;
 229        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 230    }
 231    z = absZ0;
 232    if ( zSign ) z = - z;
 233    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 234 overflow:
 235        float_raise(float_flag_invalid, status);
 236        return
 237              zSign ? (int64_t) LIT64( 0x8000000000000000 )
 238            : LIT64( 0x7FFFFFFFFFFFFFFF );
 239    }
 240    if (absZ1) {
 241        status->float_exception_flags |= float_flag_inexact;
 242    }
 243    return z;
 244
 245}
 246
 247/*----------------------------------------------------------------------------
 248| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 249| `absZ1', with binary point between bits 63 and 64 (between the input words),
 250| and returns the properly rounded 64-bit unsigned integer corresponding to the
 251| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 252| with the inexact exception raised if the input cannot be represented exactly
 253| as an integer.  However, if the fixed-point input is too large, the invalid
 254| exception is raised and the largest unsigned integer is returned.
 255*----------------------------------------------------------------------------*/
 256
 257static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 258                                uint64_t absZ1, float_status *status)
 259{
 260    int8_t roundingMode;
 261    flag roundNearestEven, increment;
 262
 263    roundingMode = status->float_rounding_mode;
 264    roundNearestEven = (roundingMode == float_round_nearest_even);
 265    switch (roundingMode) {
 266    case float_round_nearest_even:
 267    case float_round_ties_away:
 268        increment = ((int64_t)absZ1 < 0);
 269        break;
 270    case float_round_to_zero:
 271        increment = 0;
 272        break;
 273    case float_round_up:
 274        increment = !zSign && absZ1;
 275        break;
 276    case float_round_down:
 277        increment = zSign && absZ1;
 278        break;
 279    default:
 280        abort();
 281    }
 282    if (increment) {
 283        ++absZ0;
 284        if (absZ0 == 0) {
 285            float_raise(float_flag_invalid, status);
 286            return LIT64(0xFFFFFFFFFFFFFFFF);
 287        }
 288        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 289    }
 290
 291    if (zSign && absZ0) {
 292        float_raise(float_flag_invalid, status);
 293        return 0;
 294    }
 295
 296    if (absZ1) {
 297        status->float_exception_flags |= float_flag_inexact;
 298    }
 299    return absZ0;
 300}
 301
 302/*----------------------------------------------------------------------------
 303| Returns the fraction bits of the single-precision floating-point value `a'.
 304*----------------------------------------------------------------------------*/
 305
 306static inline uint32_t extractFloat32Frac( float32 a )
 307{
 308
 309    return float32_val(a) & 0x007FFFFF;
 310
 311}
 312
 313/*----------------------------------------------------------------------------
 314| Returns the exponent bits of the single-precision floating-point value `a'.
 315*----------------------------------------------------------------------------*/
 316
 317static inline int extractFloat32Exp(float32 a)
 318{
 319
 320    return ( float32_val(a)>>23 ) & 0xFF;
 321
 322}
 323
 324/*----------------------------------------------------------------------------
 325| Returns the sign bit of the single-precision floating-point value `a'.
 326*----------------------------------------------------------------------------*/
 327
 328static inline flag extractFloat32Sign( float32 a )
 329{
 330
 331    return float32_val(a)>>31;
 332
 333}
 334
 335/*----------------------------------------------------------------------------
 336| If `a' is denormal and we are in flush-to-zero mode then set the
 337| input-denormal exception and return zero. Otherwise just return the value.
 338*----------------------------------------------------------------------------*/
 339float32 float32_squash_input_denormal(float32 a, float_status *status)
 340{
 341    if (status->flush_inputs_to_zero) {
 342        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 343            float_raise(float_flag_input_denormal, status);
 344            return make_float32(float32_val(a) & 0x80000000);
 345        }
 346    }
 347    return a;
 348}
 349
 350/*----------------------------------------------------------------------------
 351| Normalizes the subnormal single-precision floating-point value represented
 352| by the denormalized significand `aSig'.  The normalized exponent and
 353| significand are stored at the locations pointed to by `zExpPtr' and
 354| `zSigPtr', respectively.
 355*----------------------------------------------------------------------------*/
 356
 357static void
 358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
 359{
 360    int8_t shiftCount;
 361
 362    shiftCount = countLeadingZeros32( aSig ) - 8;
 363    *zSigPtr = aSig<<shiftCount;
 364    *zExpPtr = 1 - shiftCount;
 365
 366}
 367
 368/*----------------------------------------------------------------------------
 369| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 370| single-precision floating-point value, returning the result.  After being
 371| shifted into the proper positions, the three fields are simply added
 372| together to form the result.  This means that any integer portion of `zSig'
 373| will be added into the exponent.  Since a properly normalized significand
 374| will have an integer portion equal to 1, the `zExp' input should be 1 less
 375| than the desired result exponent whenever `zSig' is a complete, normalized
 376| significand.
 377*----------------------------------------------------------------------------*/
 378
 379static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
 380{
 381
 382    return make_float32(
 383          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 384
 385}
 386
 387/*----------------------------------------------------------------------------
 388| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 389| and significand `zSig', and returns the proper single-precision floating-
 390| point value corresponding to the abstract input.  Ordinarily, the abstract
 391| value is simply rounded and packed into the single-precision format, with
 392| the inexact exception raised if the abstract input cannot be represented
 393| exactly.  However, if the abstract value is too large, the overflow and
 394| inexact exceptions are raised and an infinity or maximal finite value is
 395| returned.  If the abstract value is too small, the input value is rounded to
 396| a subnormal number, and the underflow and inexact exceptions are raised if
 397| the abstract input cannot be represented exactly as a subnormal single-
 398| precision floating-point number.
 399|     The input significand `zSig' has its binary point between bits 30
 400| and 29, which is 7 bits to the left of the usual location.  This shifted
 401| significand must be normalized or smaller.  If `zSig' is not normalized,
 402| `zExp' must be 0; in that case, the result returned is a subnormal number,
 403| and it must not require rounding.  In the usual case that `zSig' is
 404| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 405| The handling of underflow and overflow follows the IEC/IEEE Standard for
 406| Binary Floating-Point Arithmetic.
 407*----------------------------------------------------------------------------*/
 408
 409static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 410                                   float_status *status)
 411{
 412    int8_t roundingMode;
 413    flag roundNearestEven;
 414    int8_t roundIncrement, roundBits;
 415    flag isTiny;
 416
 417    roundingMode = status->float_rounding_mode;
 418    roundNearestEven = ( roundingMode == float_round_nearest_even );
 419    switch (roundingMode) {
 420    case float_round_nearest_even:
 421    case float_round_ties_away:
 422        roundIncrement = 0x40;
 423        break;
 424    case float_round_to_zero:
 425        roundIncrement = 0;
 426        break;
 427    case float_round_up:
 428        roundIncrement = zSign ? 0 : 0x7f;
 429        break;
 430    case float_round_down:
 431        roundIncrement = zSign ? 0x7f : 0;
 432        break;
 433    default:
 434        abort();
 435        break;
 436    }
 437    roundBits = zSig & 0x7F;
 438    if ( 0xFD <= (uint16_t) zExp ) {
 439        if (    ( 0xFD < zExp )
 440             || (    ( zExp == 0xFD )
 441                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 442           ) {
 443            float_raise(float_flag_overflow | float_flag_inexact, status);
 444            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 445        }
 446        if ( zExp < 0 ) {
 447            if (status->flush_to_zero) {
 448                float_raise(float_flag_output_denormal, status);
 449                return packFloat32(zSign, 0, 0);
 450            }
 451            isTiny =
 452                (status->float_detect_tininess
 453                 == float_tininess_before_rounding)
 454                || ( zExp < -1 )
 455                || ( zSig + roundIncrement < 0x80000000 );
 456            shift32RightJamming( zSig, - zExp, &zSig );
 457            zExp = 0;
 458            roundBits = zSig & 0x7F;
 459            if (isTiny && roundBits) {
 460                float_raise(float_flag_underflow, status);
 461            }
 462        }
 463    }
 464    if (roundBits) {
 465        status->float_exception_flags |= float_flag_inexact;
 466    }
 467    zSig = ( zSig + roundIncrement )>>7;
 468    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 469    if ( zSig == 0 ) zExp = 0;
 470    return packFloat32( zSign, zExp, zSig );
 471
 472}
 473
 474/*----------------------------------------------------------------------------
 475| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 476| and significand `zSig', and returns the proper single-precision floating-
 477| point value corresponding to the abstract input.  This routine is just like
 478| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 479| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 480| floating-point exponent.
 481*----------------------------------------------------------------------------*/
 482
 483static float32
 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 485                              float_status *status)
 486{
 487    int8_t shiftCount;
 488
 489    shiftCount = countLeadingZeros32( zSig ) - 1;
 490    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 491                               status);
 492
 493}
 494
 495/*----------------------------------------------------------------------------
 496| Returns the fraction bits of the double-precision floating-point value `a'.
 497*----------------------------------------------------------------------------*/
 498
 499static inline uint64_t extractFloat64Frac( float64 a )
 500{
 501
 502    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 503
 504}
 505
 506/*----------------------------------------------------------------------------
 507| Returns the exponent bits of the double-precision floating-point value `a'.
 508*----------------------------------------------------------------------------*/
 509
 510static inline int extractFloat64Exp(float64 a)
 511{
 512
 513    return ( float64_val(a)>>52 ) & 0x7FF;
 514
 515}
 516
 517/*----------------------------------------------------------------------------
 518| Returns the sign bit of the double-precision floating-point value `a'.
 519*----------------------------------------------------------------------------*/
 520
 521static inline flag extractFloat64Sign( float64 a )
 522{
 523
 524    return float64_val(a)>>63;
 525
 526}
 527
 528/*----------------------------------------------------------------------------
 529| If `a' is denormal and we are in flush-to-zero mode then set the
 530| input-denormal exception and return zero. Otherwise just return the value.
 531*----------------------------------------------------------------------------*/
 532float64 float64_squash_input_denormal(float64 a, float_status *status)
 533{
 534    if (status->flush_inputs_to_zero) {
 535        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 536            float_raise(float_flag_input_denormal, status);
 537            return make_float64(float64_val(a) & (1ULL << 63));
 538        }
 539    }
 540    return a;
 541}
 542
 543/*----------------------------------------------------------------------------
 544| Normalizes the subnormal double-precision floating-point value represented
 545| by the denormalized significand `aSig'.  The normalized exponent and
 546| significand are stored at the locations pointed to by `zExpPtr' and
 547| `zSigPtr', respectively.
 548*----------------------------------------------------------------------------*/
 549
 550static void
 551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
 552{
 553    int8_t shiftCount;
 554
 555    shiftCount = countLeadingZeros64( aSig ) - 11;
 556    *zSigPtr = aSig<<shiftCount;
 557    *zExpPtr = 1 - shiftCount;
 558
 559}
 560
 561/*----------------------------------------------------------------------------
 562| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 563| double-precision floating-point value, returning the result.  After being
 564| shifted into the proper positions, the three fields are simply added
 565| together to form the result.  This means that any integer portion of `zSig'
 566| will be added into the exponent.  Since a properly normalized significand
 567| will have an integer portion equal to 1, the `zExp' input should be 1 less
 568| than the desired result exponent whenever `zSig' is a complete, normalized
 569| significand.
 570*----------------------------------------------------------------------------*/
 571
 572static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
 573{
 574
 575    return make_float64(
 576        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 577
 578}
 579
 580/*----------------------------------------------------------------------------
 581| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 582| and significand `zSig', and returns the proper double-precision floating-
 583| point value corresponding to the abstract input.  Ordinarily, the abstract
 584| value is simply rounded and packed into the double-precision format, with
 585| the inexact exception raised if the abstract input cannot be represented
 586| exactly.  However, if the abstract value is too large, the overflow and
 587| inexact exceptions are raised and an infinity or maximal finite value is
 588| returned.  If the abstract value is too small, the input value is rounded to
 589| a subnormal number, and the underflow and inexact exceptions are raised if
 590| the abstract input cannot be represented exactly as a subnormal double-
 591| precision floating-point number.
 592|     The input significand `zSig' has its binary point between bits 62
 593| and 61, which is 10 bits to the left of the usual location.  This shifted
 594| significand must be normalized or smaller.  If `zSig' is not normalized,
 595| `zExp' must be 0; in that case, the result returned is a subnormal number,
 596| and it must not require rounding.  In the usual case that `zSig' is
 597| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 598| The handling of underflow and overflow follows the IEC/IEEE Standard for
 599| Binary Floating-Point Arithmetic.
 600*----------------------------------------------------------------------------*/
 601
 602static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 603                                   float_status *status)
 604{
 605    int8_t roundingMode;
 606    flag roundNearestEven;
 607    int roundIncrement, roundBits;
 608    flag isTiny;
 609
 610    roundingMode = status->float_rounding_mode;
 611    roundNearestEven = ( roundingMode == float_round_nearest_even );
 612    switch (roundingMode) {
 613    case float_round_nearest_even:
 614    case float_round_ties_away:
 615        roundIncrement = 0x200;
 616        break;
 617    case float_round_to_zero:
 618        roundIncrement = 0;
 619        break;
 620    case float_round_up:
 621        roundIncrement = zSign ? 0 : 0x3ff;
 622        break;
 623    case float_round_down:
 624        roundIncrement = zSign ? 0x3ff : 0;
 625        break;
 626    default:
 627        abort();
 628    }
 629    roundBits = zSig & 0x3FF;
 630    if ( 0x7FD <= (uint16_t) zExp ) {
 631        if (    ( 0x7FD < zExp )
 632             || (    ( zExp == 0x7FD )
 633                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 634           ) {
 635            float_raise(float_flag_overflow | float_flag_inexact, status);
 636            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 637        }
 638        if ( zExp < 0 ) {
 639            if (status->flush_to_zero) {
 640                float_raise(float_flag_output_denormal, status);
 641                return packFloat64(zSign, 0, 0);
 642            }
 643            isTiny =
 644                   (status->float_detect_tininess
 645                    == float_tininess_before_rounding)
 646                || ( zExp < -1 )
 647                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 648            shift64RightJamming( zSig, - zExp, &zSig );
 649            zExp = 0;
 650            roundBits = zSig & 0x3FF;
 651            if (isTiny && roundBits) {
 652                float_raise(float_flag_underflow, status);
 653            }
 654        }
 655    }
 656    if (roundBits) {
 657        status->float_exception_flags |= float_flag_inexact;
 658    }
 659    zSig = ( zSig + roundIncrement )>>10;
 660    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 661    if ( zSig == 0 ) zExp = 0;
 662    return packFloat64( zSign, zExp, zSig );
 663
 664}
 665
 666/*----------------------------------------------------------------------------
 667| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 668| and significand `zSig', and returns the proper double-precision floating-
 669| point value corresponding to the abstract input.  This routine is just like
 670| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 671| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 672| floating-point exponent.
 673*----------------------------------------------------------------------------*/
 674
 675static float64
 676 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 677                              float_status *status)
 678{
 679    int8_t shiftCount;
 680
 681    shiftCount = countLeadingZeros64( zSig ) - 1;
 682    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 683                               status);
 684
 685}
 686
 687/*----------------------------------------------------------------------------
 688| Returns the fraction bits of the extended double-precision floating-point
 689| value `a'.
 690*----------------------------------------------------------------------------*/
 691
 692static inline uint64_t extractFloatx80Frac( floatx80 a )
 693{
 694
 695    return a.low;
 696
 697}
 698
 699/*----------------------------------------------------------------------------
 700| Returns the exponent bits of the extended double-precision floating-point
 701| value `a'.
 702*----------------------------------------------------------------------------*/
 703
 704static inline int32_t extractFloatx80Exp( floatx80 a )
 705{
 706
 707    return a.high & 0x7FFF;
 708
 709}
 710
 711/*----------------------------------------------------------------------------
 712| Returns the sign bit of the extended double-precision floating-point value
 713| `a'.
 714*----------------------------------------------------------------------------*/
 715
 716static inline flag extractFloatx80Sign( floatx80 a )
 717{
 718
 719    return a.high>>15;
 720
 721}
 722
 723/*----------------------------------------------------------------------------
 724| Normalizes the subnormal extended double-precision floating-point value
 725| represented by the denormalized significand `aSig'.  The normalized exponent
 726| and significand are stored at the locations pointed to by `zExpPtr' and
 727| `zSigPtr', respectively.
 728*----------------------------------------------------------------------------*/
 729
 730static void
 731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
 732{
 733    int8_t shiftCount;
 734
 735    shiftCount = countLeadingZeros64( aSig );
 736    *zSigPtr = aSig<<shiftCount;
 737    *zExpPtr = 1 - shiftCount;
 738
 739}
 740
 741/*----------------------------------------------------------------------------
 742| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 743| extended double-precision floating-point value, returning the result.
 744*----------------------------------------------------------------------------*/
 745
 746static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
 747{
 748    floatx80 z;
 749
 750    z.low = zSig;
 751    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 752    return z;
 753
 754}
 755
 756/*----------------------------------------------------------------------------
 757| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 758| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 759| and returns the proper extended double-precision floating-point value
 760| corresponding to the abstract input.  Ordinarily, the abstract value is
 761| rounded and packed into the extended double-precision format, with the
 762| inexact exception raised if the abstract input cannot be represented
 763| exactly.  However, if the abstract value is too large, the overflow and
 764| inexact exceptions are raised and an infinity or maximal finite value is
 765| returned.  If the abstract value is too small, the input value is rounded to
 766| a subnormal number, and the underflow and inexact exceptions are raised if
 767| the abstract input cannot be represented exactly as a subnormal extended
 768| double-precision floating-point number.
 769|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 770| number of bits as single or double precision, respectively.  Otherwise, the
 771| result is rounded to the full precision of the extended double-precision
 772| format.
 773|     The input significand must be normalized or smaller.  If the input
 774| significand is not normalized, `zExp' must be 0; in that case, the result
 775| returned is a subnormal number, and it must not require rounding.  The
 776| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 777| Floating-Point Arithmetic.
 778*----------------------------------------------------------------------------*/
 779
 780static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
 781                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
 782                                     float_status *status)
 783{
 784    int8_t roundingMode;
 785    flag roundNearestEven, increment, isTiny;
 786    int64_t roundIncrement, roundMask, roundBits;
 787
 788    roundingMode = status->float_rounding_mode;
 789    roundNearestEven = ( roundingMode == float_round_nearest_even );
 790    if ( roundingPrecision == 80 ) goto precision80;
 791    if ( roundingPrecision == 64 ) {
 792        roundIncrement = LIT64( 0x0000000000000400 );
 793        roundMask = LIT64( 0x00000000000007FF );
 794    }
 795    else if ( roundingPrecision == 32 ) {
 796        roundIncrement = LIT64( 0x0000008000000000 );
 797        roundMask = LIT64( 0x000000FFFFFFFFFF );
 798    }
 799    else {
 800        goto precision80;
 801    }
 802    zSig0 |= ( zSig1 != 0 );
 803    switch (roundingMode) {
 804    case float_round_nearest_even:
 805    case float_round_ties_away:
 806        break;
 807    case float_round_to_zero:
 808        roundIncrement = 0;
 809        break;
 810    case float_round_up:
 811        roundIncrement = zSign ? 0 : roundMask;
 812        break;
 813    case float_round_down:
 814        roundIncrement = zSign ? roundMask : 0;
 815        break;
 816    default:
 817        abort();
 818    }
 819    roundBits = zSig0 & roundMask;
 820    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 821        if (    ( 0x7FFE < zExp )
 822             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 823           ) {
 824            goto overflow;
 825        }
 826        if ( zExp <= 0 ) {
 827            if (status->flush_to_zero) {
 828                float_raise(float_flag_output_denormal, status);
 829                return packFloatx80(zSign, 0, 0);
 830            }
 831            isTiny =
 832                   (status->float_detect_tininess
 833                    == float_tininess_before_rounding)
 834                || ( zExp < 0 )
 835                || ( zSig0 <= zSig0 + roundIncrement );
 836            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 837            zExp = 0;
 838            roundBits = zSig0 & roundMask;
 839            if (isTiny && roundBits) {
 840                float_raise(float_flag_underflow, status);
 841            }
 842            if (roundBits) {
 843                status->float_exception_flags |= float_flag_inexact;
 844            }
 845            zSig0 += roundIncrement;
 846            if ( (int64_t) zSig0 < 0 ) zExp = 1;
 847            roundIncrement = roundMask + 1;
 848            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 849                roundMask |= roundIncrement;
 850            }
 851            zSig0 &= ~ roundMask;
 852            return packFloatx80( zSign, zExp, zSig0 );
 853        }
 854    }
 855    if (roundBits) {
 856        status->float_exception_flags |= float_flag_inexact;
 857    }
 858    zSig0 += roundIncrement;
 859    if ( zSig0 < roundIncrement ) {
 860        ++zExp;
 861        zSig0 = LIT64( 0x8000000000000000 );
 862    }
 863    roundIncrement = roundMask + 1;
 864    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 865        roundMask |= roundIncrement;
 866    }
 867    zSig0 &= ~ roundMask;
 868    if ( zSig0 == 0 ) zExp = 0;
 869    return packFloatx80( zSign, zExp, zSig0 );
 870 precision80:
 871    switch (roundingMode) {
 872    case float_round_nearest_even:
 873    case float_round_ties_away:
 874        increment = ((int64_t)zSig1 < 0);
 875        break;
 876    case float_round_to_zero:
 877        increment = 0;
 878        break;
 879    case float_round_up:
 880        increment = !zSign && zSig1;
 881        break;
 882    case float_round_down:
 883        increment = zSign && zSig1;
 884        break;
 885    default:
 886        abort();
 887    }
 888    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 889        if (    ( 0x7FFE < zExp )
 890             || (    ( zExp == 0x7FFE )
 891                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 892                  && increment
 893                )
 894           ) {
 895            roundMask = 0;
 896 overflow:
 897            float_raise(float_flag_overflow | float_flag_inexact, status);
 898            if (    ( roundingMode == float_round_to_zero )
 899                 || ( zSign && ( roundingMode == float_round_up ) )
 900                 || ( ! zSign && ( roundingMode == float_round_down ) )
 901               ) {
 902                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 903            }
 904            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 905        }
 906        if ( zExp <= 0 ) {
 907            isTiny =
 908                   (status->float_detect_tininess
 909                    == float_tininess_before_rounding)
 910                || ( zExp < 0 )
 911                || ! increment
 912                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 913            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 914            zExp = 0;
 915            if (isTiny && zSig1) {
 916                float_raise(float_flag_underflow, status);
 917            }
 918            if (zSig1) {
 919                status->float_exception_flags |= float_flag_inexact;
 920            }
 921            switch (roundingMode) {
 922            case float_round_nearest_even:
 923            case float_round_ties_away:
 924                increment = ((int64_t)zSig1 < 0);
 925                break;
 926            case float_round_to_zero:
 927                increment = 0;
 928                break;
 929            case float_round_up:
 930                increment = !zSign && zSig1;
 931                break;
 932            case float_round_down:
 933                increment = zSign && zSig1;
 934                break;
 935            default:
 936                abort();
 937            }
 938            if ( increment ) {
 939                ++zSig0;
 940                zSig0 &=
 941                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 942                if ( (int64_t) zSig0 < 0 ) zExp = 1;
 943            }
 944            return packFloatx80( zSign, zExp, zSig0 );
 945        }
 946    }
 947    if (zSig1) {
 948        status->float_exception_flags |= float_flag_inexact;
 949    }
 950    if ( increment ) {
 951        ++zSig0;
 952        if ( zSig0 == 0 ) {
 953            ++zExp;
 954            zSig0 = LIT64( 0x8000000000000000 );
 955        }
 956        else {
 957            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 958        }
 959    }
 960    else {
 961        if ( zSig0 == 0 ) zExp = 0;
 962    }
 963    return packFloatx80( zSign, zExp, zSig0 );
 964
 965}
 966
 967/*----------------------------------------------------------------------------
 968| Takes an abstract floating-point value having sign `zSign', exponent
 969| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 970| and returns the proper extended double-precision floating-point value
 971| corresponding to the abstract input.  This routine is just like
 972| `roundAndPackFloatx80' except that the input significand does not have to be
 973| normalized.
 974*----------------------------------------------------------------------------*/
 975
 976static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
 977                                              flag zSign, int32_t zExp,
 978                                              uint64_t zSig0, uint64_t zSig1,
 979                                              float_status *status)
 980{
 981    int8_t shiftCount;
 982
 983    if ( zSig0 == 0 ) {
 984        zSig0 = zSig1;
 985        zSig1 = 0;
 986        zExp -= 64;
 987    }
 988    shiftCount = countLeadingZeros64( zSig0 );
 989    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 990    zExp -= shiftCount;
 991    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
 992                                zSig0, zSig1, status);
 993
 994}
 995
 996/*----------------------------------------------------------------------------
 997| Returns the least-significant 64 fraction bits of the quadruple-precision
 998| floating-point value `a'.
 999*----------------------------------------------------------------------------*/
1000
1001static inline uint64_t extractFloat128Frac1( float128 a )
1002{
1003
1004    return a.low;
1005
1006}
1007
1008/*----------------------------------------------------------------------------
1009| Returns the most-significant 48 fraction bits of the quadruple-precision
1010| floating-point value `a'.
1011*----------------------------------------------------------------------------*/
1012
1013static inline uint64_t extractFloat128Frac0( float128 a )
1014{
1015
1016    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017
1018}
1019
1020/*----------------------------------------------------------------------------
1021| Returns the exponent bits of the quadruple-precision floating-point value
1022| `a'.
1023*----------------------------------------------------------------------------*/
1024
1025static inline int32_t extractFloat128Exp( float128 a )
1026{
1027
1028    return ( a.high>>48 ) & 0x7FFF;
1029
1030}
1031
1032/*----------------------------------------------------------------------------
1033| Returns the sign bit of the quadruple-precision floating-point value `a'.
1034*----------------------------------------------------------------------------*/
1035
1036static inline flag extractFloat128Sign( float128 a )
1037{
1038
1039    return a.high>>63;
1040
1041}
1042
1043/*----------------------------------------------------------------------------
1044| Normalizes the subnormal quadruple-precision floating-point value
1045| represented by the denormalized significand formed by the concatenation of
1046| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1047| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1048| significand are stored at the location pointed to by `zSig0Ptr', and the
1049| least significant 64 bits of the normalized significand are stored at the
1050| location pointed to by `zSig1Ptr'.
1051*----------------------------------------------------------------------------*/
1052
1053static void
1054 normalizeFloat128Subnormal(
1055     uint64_t aSig0,
1056     uint64_t aSig1,
1057     int32_t *zExpPtr,
1058     uint64_t *zSig0Ptr,
1059     uint64_t *zSig1Ptr
1060 )
1061{
1062    int8_t shiftCount;
1063
1064    if ( aSig0 == 0 ) {
1065        shiftCount = countLeadingZeros64( aSig1 ) - 15;
1066        if ( shiftCount < 0 ) {
1067            *zSig0Ptr = aSig1>>( - shiftCount );
1068            *zSig1Ptr = aSig1<<( shiftCount & 63 );
1069        }
1070        else {
1071            *zSig0Ptr = aSig1<<shiftCount;
1072            *zSig1Ptr = 0;
1073        }
1074        *zExpPtr = - shiftCount - 63;
1075    }
1076    else {
1077        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1078        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1079        *zExpPtr = 1 - shiftCount;
1080    }
1081
1082}
1083
1084/*----------------------------------------------------------------------------
1085| Packs the sign `zSign', the exponent `zExp', and the significand formed
1086| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087| floating-point value, returning the result.  After being shifted into the
1088| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089| added together to form the most significant 32 bits of the result.  This
1090| means that any integer portion of `zSig0' will be added into the exponent.
1091| Since a properly normalized significand will have an integer portion equal
1092| to 1, the `zExp' input should be 1 less than the desired result exponent
1093| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094| significand.
1095*----------------------------------------------------------------------------*/
1096
1097static inline float128
1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099{
1100    float128 z;
1101
1102    z.low = zSig1;
1103    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104    return z;
1105
1106}
1107
1108/*----------------------------------------------------------------------------
1109| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110| and extended significand formed by the concatenation of `zSig0', `zSig1',
1111| and `zSig2', and returns the proper quadruple-precision floating-point value
1112| corresponding to the abstract input.  Ordinarily, the abstract value is
1113| simply rounded and packed into the quadruple-precision format, with the
1114| inexact exception raised if the abstract input cannot be represented
1115| exactly.  However, if the abstract value is too large, the overflow and
1116| inexact exceptions are raised and an infinity or maximal finite value is
1117| returned.  If the abstract value is too small, the input value is rounded to
1118| a subnormal number, and the underflow and inexact exceptions are raised if
1119| the abstract input cannot be represented exactly as a subnormal quadruple-
1120| precision floating-point number.
1121|     The input significand must be normalized or smaller.  If the input
1122| significand is not normalized, `zExp' must be 0; in that case, the result
1123| returned is a subnormal number, and it must not require rounding.  In the
1124| usual case that the input significand is normalized, `zExp' must be 1 less
1125| than the ``true'' floating-point exponent.  The handling of underflow and
1126| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127*----------------------------------------------------------------------------*/
1128
1129static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130                                     uint64_t zSig0, uint64_t zSig1,
1131                                     uint64_t zSig2, float_status *status)
1132{
1133    int8_t roundingMode;
1134    flag roundNearestEven, increment, isTiny;
1135
1136    roundingMode = status->float_rounding_mode;
1137    roundNearestEven = ( roundingMode == float_round_nearest_even );
1138    switch (roundingMode) {
1139    case float_round_nearest_even:
1140    case float_round_ties_away:
1141        increment = ((int64_t)zSig2 < 0);
1142        break;
1143    case float_round_to_zero:
1144        increment = 0;
1145        break;
1146    case float_round_up:
1147        increment = !zSign && zSig2;
1148        break;
1149    case float_round_down:
1150        increment = zSign && zSig2;
1151        break;
1152    default:
1153        abort();
1154    }
1155    if ( 0x7FFD <= (uint32_t) zExp ) {
1156        if (    ( 0x7FFD < zExp )
1157             || (    ( zExp == 0x7FFD )
1158                  && eq128(
1159                         LIT64( 0x0001FFFFFFFFFFFF ),
1160                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1161                         zSig0,
1162                         zSig1
1163                     )
1164                  && increment
1165                )
1166           ) {
1167            float_raise(float_flag_overflow | float_flag_inexact, status);
1168            if (    ( roundingMode == float_round_to_zero )
1169                 || ( zSign && ( roundingMode == float_round_up ) )
1170                 || ( ! zSign && ( roundingMode == float_round_down ) )
1171               ) {
1172                return
1173                    packFloat128(
1174                        zSign,
1175                        0x7FFE,
1176                        LIT64( 0x0000FFFFFFFFFFFF ),
1177                        LIT64( 0xFFFFFFFFFFFFFFFF )
1178                    );
1179            }
1180            return packFloat128( zSign, 0x7FFF, 0, 0 );
1181        }
1182        if ( zExp < 0 ) {
1183            if (status->flush_to_zero) {
1184                float_raise(float_flag_output_denormal, status);
1185                return packFloat128(zSign, 0, 0, 0);
1186            }
1187            isTiny =
1188                   (status->float_detect_tininess
1189                    == float_tininess_before_rounding)
1190                || ( zExp < -1 )
1191                || ! increment
1192                || lt128(
1193                       zSig0,
1194                       zSig1,
1195                       LIT64( 0x0001FFFFFFFFFFFF ),
1196                       LIT64( 0xFFFFFFFFFFFFFFFF )
1197                   );
1198            shift128ExtraRightJamming(
1199                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200            zExp = 0;
1201            if (isTiny && zSig2) {
1202                float_raise(float_flag_underflow, status);
1203            }
1204            switch (roundingMode) {
1205            case float_round_nearest_even:
1206            case float_round_ties_away:
1207                increment = ((int64_t)zSig2 < 0);
1208                break;
1209            case float_round_to_zero:
1210                increment = 0;
1211                break;
1212            case float_round_up:
1213                increment = !zSign && zSig2;
1214                break;
1215            case float_round_down:
1216                increment = zSign && zSig2;
1217                break;
1218            default:
1219                abort();
1220            }
1221        }
1222    }
1223    if (zSig2) {
1224        status->float_exception_flags |= float_flag_inexact;
1225    }
1226    if ( increment ) {
1227        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229    }
1230    else {
1231        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232    }
1233    return packFloat128( zSign, zExp, zSig0, zSig1 );
1234
1235}
1236
1237/*----------------------------------------------------------------------------
1238| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239| and significand formed by the concatenation of `zSig0' and `zSig1', and
1240| returns the proper quadruple-precision floating-point value corresponding
1241| to the abstract input.  This routine is just like `roundAndPackFloat128'
1242| except that the input significand has fewer bits and does not have to be
1243| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1244| point exponent.
1245*----------------------------------------------------------------------------*/
1246
1247static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248                                              uint64_t zSig0, uint64_t zSig1,
1249                                              float_status *status)
1250{
1251    int8_t shiftCount;
1252    uint64_t zSig2;
1253
1254    if ( zSig0 == 0 ) {
1255        zSig0 = zSig1;
1256        zSig1 = 0;
1257        zExp -= 64;
1258    }
1259    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260    if ( 0 <= shiftCount ) {
1261        zSig2 = 0;
1262        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263    }
1264    else {
1265        shift128ExtraRightJamming(
1266            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267    }
1268    zExp -= shiftCount;
1269    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1270
1271}
1272
1273/*----------------------------------------------------------------------------
1274| Returns the result of converting the 32-bit two's complement integer `a'
1275| to the single-precision floating-point format.  The conversion is performed
1276| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277*----------------------------------------------------------------------------*/
1278
1279float32 int32_to_float32(int32_t a, float_status *status)
1280{
1281    flag zSign;
1282
1283    if ( a == 0 ) return float32_zero;
1284    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285    zSign = ( a < 0 );
1286    return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1287}
1288
1289/*----------------------------------------------------------------------------
1290| Returns the result of converting the 32-bit two's complement integer `a'
1291| to the double-precision floating-point format.  The conversion is performed
1292| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293*----------------------------------------------------------------------------*/
1294
1295float64 int32_to_float64(int32_t a, float_status *status)
1296{
1297    flag zSign;
1298    uint32_t absA;
1299    int8_t shiftCount;
1300    uint64_t zSig;
1301
1302    if ( a == 0 ) return float64_zero;
1303    zSign = ( a < 0 );
1304    absA = zSign ? - a : a;
1305    shiftCount = countLeadingZeros32( absA ) + 21;
1306    zSig = absA;
1307    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308
1309}
1310
1311/*----------------------------------------------------------------------------
1312| Returns the result of converting the 32-bit two's complement integer `a'
1313| to the extended double-precision floating-point format.  The conversion
1314| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315| Arithmetic.
1316*----------------------------------------------------------------------------*/
1317
1318floatx80 int32_to_floatx80(int32_t a, float_status *status)
1319{
1320    flag zSign;
1321    uint32_t absA;
1322    int8_t shiftCount;
1323    uint64_t zSig;
1324
1325    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326    zSign = ( a < 0 );
1327    absA = zSign ? - a : a;
1328    shiftCount = countLeadingZeros32( absA ) + 32;
1329    zSig = absA;
1330    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331
1332}
1333
1334/*----------------------------------------------------------------------------
1335| Returns the result of converting the 32-bit two's complement integer `a' to
1336| the quadruple-precision floating-point format.  The conversion is performed
1337| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338*----------------------------------------------------------------------------*/
1339
1340float128 int32_to_float128(int32_t a, float_status *status)
1341{
1342    flag zSign;
1343    uint32_t absA;
1344    int8_t shiftCount;
1345    uint64_t zSig0;
1346
1347    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348    zSign = ( a < 0 );
1349    absA = zSign ? - a : a;
1350    shiftCount = countLeadingZeros32( absA ) + 17;
1351    zSig0 = absA;
1352    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353
1354}
1355
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the 64-bit two's complement integer `a'
1358| to the single-precision floating-point format.  The conversion is performed
1359| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360*----------------------------------------------------------------------------*/
1361
1362float32 int64_to_float32(int64_t a, float_status *status)
1363{
1364    flag zSign;
1365    uint64_t absA;
1366    int8_t shiftCount;
1367
1368    if ( a == 0 ) return float32_zero;
1369    zSign = ( a < 0 );
1370    absA = zSign ? - a : a;
1371    shiftCount = countLeadingZeros64( absA ) - 40;
1372    if ( 0 <= shiftCount ) {
1373        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374    }
1375    else {
1376        shiftCount += 7;
1377        if ( shiftCount < 0 ) {
1378            shift64RightJamming( absA, - shiftCount, &absA );
1379        }
1380        else {
1381            absA <<= shiftCount;
1382        }
1383        return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1384    }
1385
1386}
1387
1388/*----------------------------------------------------------------------------
1389| Returns the result of converting the 64-bit two's complement integer `a'
1390| to the double-precision floating-point format.  The conversion is performed
1391| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392*----------------------------------------------------------------------------*/
1393
1394float64 int64_to_float64(int64_t a, float_status *status)
1395{
1396    flag zSign;
1397
1398    if ( a == 0 ) return float64_zero;
1399    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400        return packFloat64( 1, 0x43E, 0 );
1401    }
1402    zSign = ( a < 0 );
1403    return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1404}
1405
1406/*----------------------------------------------------------------------------
1407| Returns the result of converting the 64-bit two's complement integer `a'
1408| to the extended double-precision floating-point format.  The conversion
1409| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410| Arithmetic.
1411*----------------------------------------------------------------------------*/
1412
1413floatx80 int64_to_floatx80(int64_t a, float_status *status)
1414{
1415    flag zSign;
1416    uint64_t absA;
1417    int8_t shiftCount;
1418
1419    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420    zSign = ( a < 0 );
1421    absA = zSign ? - a : a;
1422    shiftCount = countLeadingZeros64( absA );
1423    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424
1425}
1426
1427/*----------------------------------------------------------------------------
1428| Returns the result of converting the 64-bit two's complement integer `a' to
1429| the quadruple-precision floating-point format.  The conversion is performed
1430| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431*----------------------------------------------------------------------------*/
1432
1433float128 int64_to_float128(int64_t a, float_status *status)
1434{
1435    flag zSign;
1436    uint64_t absA;
1437    int8_t shiftCount;
1438    int32_t zExp;
1439    uint64_t zSig0, zSig1;
1440
1441    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442    zSign = ( a < 0 );
1443    absA = zSign ? - a : a;
1444    shiftCount = countLeadingZeros64( absA ) + 49;
1445    zExp = 0x406E - shiftCount;
1446    if ( 64 <= shiftCount ) {
1447        zSig1 = 0;
1448        zSig0 = absA;
1449        shiftCount -= 64;
1450    }
1451    else {
1452        zSig1 = absA;
1453        zSig0 = 0;
1454    }
1455    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456    return packFloat128( zSign, zExp, zSig0, zSig1 );
1457
1458}
1459
1460/*----------------------------------------------------------------------------
1461| Returns the result of converting the 64-bit unsigned integer `a'
1462| to the single-precision floating-point format.  The conversion is performed
1463| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464*----------------------------------------------------------------------------*/
1465
1466float32 uint64_to_float32(uint64_t a, float_status *status)
1467{
1468    int shiftcount;
1469
1470    if (a == 0) {
1471        return float32_zero;
1472    }
1473
1474    /* Determine (left) shift needed to put first set bit into bit posn 23
1475     * (since packFloat32() expects the binary point between bits 23 and 22);
1476     * this is the fast case for smallish numbers.
1477     */
1478    shiftcount = countLeadingZeros64(a) - 40;
1479    if (shiftcount >= 0) {
1480        return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481    }
1482    /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483     * expects the binary point between bits 30 and 29, hence the + 7.
1484     */
1485    shiftcount += 7;
1486    if (shiftcount < 0) {
1487        shift64RightJamming(a, -shiftcount, &a);
1488    } else {
1489        a <<= shiftcount;
1490    }
1491
1492    return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1493}
1494
1495/*----------------------------------------------------------------------------
1496| Returns the result of converting the 64-bit unsigned integer `a'
1497| to the double-precision floating-point format.  The conversion is performed
1498| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499*----------------------------------------------------------------------------*/
1500
1501float64 uint64_to_float64(uint64_t a, float_status *status)
1502{
1503    int exp = 0x43C;
1504    int shiftcount;
1505
1506    if (a == 0) {
1507        return float64_zero;
1508    }
1509
1510    shiftcount = countLeadingZeros64(a) - 1;
1511    if (shiftcount < 0) {
1512        shift64RightJamming(a, -shiftcount, &a);
1513    } else {
1514        a <<= shiftcount;
1515    }
1516    return roundAndPackFloat64(0, exp - shiftcount, a, status);
1517}
1518
1519/*----------------------------------------------------------------------------
1520| Returns the result of converting the 64-bit unsigned integer `a'
1521| to the quadruple-precision floating-point format.  The conversion is performed
1522| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523*----------------------------------------------------------------------------*/
1524
1525float128 uint64_to_float128(uint64_t a, float_status *status)
1526{
1527    if (a == 0) {
1528        return float128_zero;
1529    }
1530    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1531}
1532
1533/*----------------------------------------------------------------------------
1534| Returns the result of converting the single-precision floating-point value
1535| `a' to the 32-bit two's complement integer format.  The conversion is
1536| performed according to the IEC/IEEE Standard for Binary Floating-Point
1537| Arithmetic---which means in particular that the conversion is rounded
1538| according to the current rounding mode.  If `a' is a NaN, the largest
1539| positive integer is returned.  Otherwise, if the conversion overflows, the
1540| largest integer with the same sign as `a' is returned.
1541*----------------------------------------------------------------------------*/
1542
1543int32_t float32_to_int32(float32 a, float_status *status)
1544{
1545    flag aSign;
1546    int aExp;
1547    int shiftCount;
1548    uint32_t aSig;
1549    uint64_t aSig64;
1550
1551    a = float32_squash_input_denormal(a, status);
1552    aSig = extractFloat32Frac( a );
1553    aExp = extractFloat32Exp( a );
1554    aSign = extractFloat32Sign( a );
1555    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556    if ( aExp ) aSig |= 0x00800000;
1557    shiftCount = 0xAF - aExp;
1558    aSig64 = aSig;
1559    aSig64 <<= 32;
1560    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561    return roundAndPackInt32(aSign, aSig64, status);
1562
1563}
1564
1565/*----------------------------------------------------------------------------
1566| Returns the result of converting the single-precision floating-point value
1567| `a' to the 32-bit two's complement integer format.  The conversion is
1568| performed according to the IEC/IEEE Standard for Binary Floating-Point
1569| Arithmetic, except that the conversion is always rounded toward zero.
1570| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1571| the conversion overflows, the largest integer with the same sign as `a' is
1572| returned.
1573*----------------------------------------------------------------------------*/
1574
1575int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576{
1577    flag aSign;
1578    int aExp;
1579    int shiftCount;
1580    uint32_t aSig;
1581    int32_t z;
1582    a = float32_squash_input_denormal(a, status);
1583
1584    aSig = extractFloat32Frac( a );
1585    aExp = extractFloat32Exp( a );
1586    aSign = extractFloat32Sign( a );
1587    shiftCount = aExp - 0x9E;
1588    if ( 0 <= shiftCount ) {
1589        if ( float32_val(a) != 0xCF000000 ) {
1590            float_raise(float_flag_invalid, status);
1591            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592        }
1593        return (int32_t) 0x80000000;
1594    }
1595    else if ( aExp <= 0x7E ) {
1596        if (aExp | aSig) {
1597            status->float_exception_flags |= float_flag_inexact;
1598        }
1599        return 0;
1600    }
1601    aSig = ( aSig | 0x00800000 )<<8;
1602    z = aSig>>( - shiftCount );
1603    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1604        status->float_exception_flags |= float_flag_inexact;
1605    }
1606    if ( aSign ) z = - z;
1607    return z;
1608
1609}
1610
1611/*----------------------------------------------------------------------------
1612| Returns the result of converting the single-precision floating-point value
1613| `a' to the 16-bit two's complement integer format.  The conversion is
1614| performed according to the IEC/IEEE Standard for Binary Floating-Point
1615| Arithmetic, except that the conversion is always rounded toward zero.
1616| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1617| the conversion overflows, the largest integer with the same sign as `a' is
1618| returned.
1619*----------------------------------------------------------------------------*/
1620
1621int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1622{
1623    flag aSign;
1624    int aExp;
1625    int shiftCount;
1626    uint32_t aSig;
1627    int32_t z;
1628
1629    aSig = extractFloat32Frac( a );
1630    aExp = extractFloat32Exp( a );
1631    aSign = extractFloat32Sign( a );
1632    shiftCount = aExp - 0x8E;
1633    if ( 0 <= shiftCount ) {
1634        if ( float32_val(a) != 0xC7000000 ) {
1635            float_raise(float_flag_invalid, status);
1636            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1637                return 0x7FFF;
1638            }
1639        }
1640        return (int32_t) 0xffff8000;
1641    }
1642    else if ( aExp <= 0x7E ) {
1643        if ( aExp | aSig ) {
1644            status->float_exception_flags |= float_flag_inexact;
1645        }
1646        return 0;
1647    }
1648    shiftCount -= 0x10;
1649    aSig = ( aSig | 0x00800000 )<<8;
1650    z = aSig>>( - shiftCount );
1651    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1652        status->float_exception_flags |= float_flag_inexact;
1653    }
1654    if ( aSign ) {
1655        z = - z;
1656    }
1657    return z;
1658
1659}
1660
1661/*----------------------------------------------------------------------------
1662| Returns the result of converting the single-precision floating-point value
1663| `a' to the 64-bit two's complement integer format.  The conversion is
1664| performed according to the IEC/IEEE Standard for Binary Floating-Point
1665| Arithmetic---which means in particular that the conversion is rounded
1666| according to the current rounding mode.  If `a' is a NaN, the largest
1667| positive integer is returned.  Otherwise, if the conversion overflows, the
1668| largest integer with the same sign as `a' is returned.
1669*----------------------------------------------------------------------------*/
1670
1671int64_t float32_to_int64(float32 a, float_status *status)
1672{
1673    flag aSign;
1674    int aExp;
1675    int shiftCount;
1676    uint32_t aSig;
1677    uint64_t aSig64, aSigExtra;
1678    a = float32_squash_input_denormal(a, status);
1679
1680    aSig = extractFloat32Frac( a );
1681    aExp = extractFloat32Exp( a );
1682    aSign = extractFloat32Sign( a );
1683    shiftCount = 0xBE - aExp;
1684    if ( shiftCount < 0 ) {
1685        float_raise(float_flag_invalid, status);
1686        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1687            return LIT64( 0x7FFFFFFFFFFFFFFF );
1688        }
1689        return (int64_t) LIT64( 0x8000000000000000 );
1690    }
1691    if ( aExp ) aSig |= 0x00800000;
1692    aSig64 = aSig;
1693    aSig64 <<= 40;
1694    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1695    return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696
1697}
1698
1699/*----------------------------------------------------------------------------
1700| Returns the result of converting the single-precision floating-point value
1701| `a' to the 64-bit unsigned integer format.  The conversion is
1702| performed according to the IEC/IEEE Standard for Binary Floating-Point
1703| Arithmetic---which means in particular that the conversion is rounded
1704| according to the current rounding mode.  If `a' is a NaN, the largest
1705| unsigned integer is returned.  Otherwise, if the conversion overflows, the
1706| largest unsigned integer is returned.  If the 'a' is negative, the result
1707| is rounded and zero is returned; values that do not round to zero will
1708| raise the inexact exception flag.
1709*----------------------------------------------------------------------------*/
1710
1711uint64_t float32_to_uint64(float32 a, float_status *status)
1712{
1713    flag aSign;
1714    int aExp;
1715    int shiftCount;
1716    uint32_t aSig;
1717    uint64_t aSig64, aSigExtra;
1718    a = float32_squash_input_denormal(a, status);
1719
1720    aSig = extractFloat32Frac(a);
1721    aExp = extractFloat32Exp(a);
1722    aSign = extractFloat32Sign(a);
1723    if ((aSign) && (aExp > 126)) {
1724        float_raise(float_flag_invalid, status);
1725        if (float32_is_any_nan(a)) {
1726            return LIT64(0xFFFFFFFFFFFFFFFF);
1727        } else {
1728            return 0;
1729        }
1730    }
1731    shiftCount = 0xBE - aExp;
1732    if (aExp) {
1733        aSig |= 0x00800000;
1734    }
1735    if (shiftCount < 0) {
1736        float_raise(float_flag_invalid, status);
1737        return LIT64(0xFFFFFFFFFFFFFFFF);
1738    }
1739
1740    aSig64 = aSig;
1741    aSig64 <<= 40;
1742    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1743    return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1744}
1745
1746/*----------------------------------------------------------------------------
1747| Returns the result of converting the single-precision floating-point value
1748| `a' to the 64-bit unsigned integer format.  The conversion is
1749| performed according to the IEC/IEEE Standard for Binary Floating-Point
1750| Arithmetic, except that the conversion is always rounded toward zero.  If
1751| `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1752| conversion overflows, the largest unsigned integer is returned.  If the
1753| 'a' is negative, the result is rounded and zero is returned; values that do
1754| not round to zero will raise the inexact flag.
1755*----------------------------------------------------------------------------*/
1756
1757uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1758{
1759    signed char current_rounding_mode = status->float_rounding_mode;
1760    set_float_rounding_mode(float_round_to_zero, status);
1761    int64_t v = float32_to_uint64(a, status);
1762    set_float_rounding_mode(current_rounding_mode, status);
1763    return v;
1764}
1765
1766/*----------------------------------------------------------------------------
1767| Returns the result of converting the single-precision floating-point value
1768| `a' to the 64-bit two's complement integer format.  The conversion is
1769| performed according to the IEC/IEEE Standard for Binary Floating-Point
1770| Arithmetic, except that the conversion is always rounded toward zero.  If
1771| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1772| conversion overflows, the largest integer with the same sign as `a' is
1773| returned.
1774*----------------------------------------------------------------------------*/
1775
1776int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1777{
1778    flag aSign;
1779    int aExp;
1780    int shiftCount;
1781    uint32_t aSig;
1782    uint64_t aSig64;
1783    int64_t z;
1784    a = float32_squash_input_denormal(a, status);
1785
1786    aSig = extractFloat32Frac( a );
1787    aExp = extractFloat32Exp( a );
1788    aSign = extractFloat32Sign( a );
1789    shiftCount = aExp - 0xBE;
1790    if ( 0 <= shiftCount ) {
1791        if ( float32_val(a) != 0xDF000000 ) {
1792            float_raise(float_flag_invalid, status);
1793            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1794                return LIT64( 0x7FFFFFFFFFFFFFFF );
1795            }
1796        }
1797        return (int64_t) LIT64( 0x8000000000000000 );
1798    }
1799    else if ( aExp <= 0x7E ) {
1800        if (aExp | aSig) {
1801            status->float_exception_flags |= float_flag_inexact;
1802        }
1803        return 0;
1804    }
1805    aSig64 = aSig | 0x00800000;
1806    aSig64 <<= 40;
1807    z = aSig64>>( - shiftCount );
1808    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1809        status->float_exception_flags |= float_flag_inexact;
1810    }
1811    if ( aSign ) z = - z;
1812    return z;
1813
1814}
1815
1816/*----------------------------------------------------------------------------
1817| Returns the result of converting the single-precision floating-point value
1818| `a' to the double-precision floating-point format.  The conversion is
1819| performed according to the IEC/IEEE Standard for Binary Floating-Point
1820| Arithmetic.
1821*----------------------------------------------------------------------------*/
1822
1823float64 float32_to_float64(float32 a, float_status *status)
1824{
1825    flag aSign;
1826    int aExp;
1827    uint32_t aSig;
1828    a = float32_squash_input_denormal(a, status);
1829
1830    aSig = extractFloat32Frac( a );
1831    aExp = extractFloat32Exp( a );
1832    aSign = extractFloat32Sign( a );
1833    if ( aExp == 0xFF ) {
1834        if (aSig) {
1835            return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1836        }
1837        return packFloat64( aSign, 0x7FF, 0 );
1838    }
1839    if ( aExp == 0 ) {
1840        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1841        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1842        --aExp;
1843    }
1844    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1845
1846}
1847
1848/*----------------------------------------------------------------------------
1849| Returns the result of converting the single-precision floating-point value
1850| `a' to the extended double-precision floating-point format.  The conversion
1851| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1852| Arithmetic.
1853*----------------------------------------------------------------------------*/
1854
1855floatx80 float32_to_floatx80(float32 a, float_status *status)
1856{
1857    flag aSign;
1858    int aExp;
1859    uint32_t aSig;
1860
1861    a = float32_squash_input_denormal(a, status);
1862    aSig = extractFloat32Frac( a );
1863    aExp = extractFloat32Exp( a );
1864    aSign = extractFloat32Sign( a );
1865    if ( aExp == 0xFF ) {
1866        if (aSig) {
1867            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1868        }
1869        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1870    }
1871    if ( aExp == 0 ) {
1872        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1873        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874    }
1875    aSig |= 0x00800000;
1876    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1877
1878}
1879
1880/*----------------------------------------------------------------------------
1881| Returns the result of converting the single-precision floating-point value
1882| `a' to the double-precision floating-point format.  The conversion is
1883| performed according to the IEC/IEEE Standard for Binary Floating-Point
1884| Arithmetic.
1885*----------------------------------------------------------------------------*/
1886
1887float128 float32_to_float128(float32 a, float_status *status)
1888{
1889    flag aSign;
1890    int aExp;
1891    uint32_t aSig;
1892
1893    a = float32_squash_input_denormal(a, status);
1894    aSig = extractFloat32Frac( a );
1895    aExp = extractFloat32Exp( a );
1896    aSign = extractFloat32Sign( a );
1897    if ( aExp == 0xFF ) {
1898        if (aSig) {
1899            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1900        }
1901        return packFloat128( aSign, 0x7FFF, 0, 0 );
1902    }
1903    if ( aExp == 0 ) {
1904        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1905        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1906        --aExp;
1907    }
1908    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1909
1910}
1911
1912/*----------------------------------------------------------------------------
1913| Rounds the single-precision floating-point value `a' to an integer, and
1914| returns the result as a single-precision floating-point value.  The
1915| operation is performed according to the IEC/IEEE Standard for Binary
1916| Floating-Point Arithmetic.
1917*----------------------------------------------------------------------------*/
1918
1919float32 float32_round_to_int(float32 a, float_status *status)
1920{
1921    flag aSign;
1922    int aExp;
1923    uint32_t lastBitMask, roundBitsMask;
1924    uint32_t z;
1925    a = float32_squash_input_denormal(a, status);
1926
1927    aExp = extractFloat32Exp( a );
1928    if ( 0x96 <= aExp ) {
1929        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1930            return propagateFloat32NaN(a, a, status);
1931        }
1932        return a;
1933    }
1934    if ( aExp <= 0x7E ) {
1935        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1936        status->float_exception_flags |= float_flag_inexact;
1937        aSign = extractFloat32Sign( a );
1938        switch (status->float_rounding_mode) {
1939         case float_round_nearest_even:
1940            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1941                return packFloat32( aSign, 0x7F, 0 );
1942            }
1943            break;
1944        case float_round_ties_away:
1945            if (aExp == 0x7E) {
1946                return packFloat32(aSign, 0x7F, 0);
1947            }
1948            break;
1949         case float_round_down:
1950            return make_float32(aSign ? 0xBF800000 : 0);
1951         case float_round_up:
1952            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1953        }
1954        return packFloat32( aSign, 0, 0 );
1955    }
1956    lastBitMask = 1;
1957    lastBitMask <<= 0x96 - aExp;
1958    roundBitsMask = lastBitMask - 1;
1959    z = float32_val(a);
1960    switch (status->float_rounding_mode) {
1961    case float_round_nearest_even:
1962        z += lastBitMask>>1;
1963        if ((z & roundBitsMask) == 0) {
1964            z &= ~lastBitMask;
1965        }
1966        break;
1967    case float_round_ties_away:
1968        z += lastBitMask >> 1;
1969        break;
1970    case float_round_to_zero:
1971        break;
1972    case float_round_up:
1973        if (!extractFloat32Sign(make_float32(z))) {
1974            z += roundBitsMask;
1975        }
1976        break;
1977    case float_round_down:
1978        if (extractFloat32Sign(make_float32(z))) {
1979            z += roundBitsMask;
1980        }
1981        break;
1982    default:
1983        abort();
1984    }
1985    z &= ~ roundBitsMask;
1986    if (z != float32_val(a)) {
1987        status->float_exception_flags |= float_flag_inexact;
1988    }
1989    return make_float32(z);
1990
1991}
1992
1993/*----------------------------------------------------------------------------
1994| Returns the result of adding the absolute values of the single-precision
1995| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1996| before being returned.  `zSign' is ignored if the result is a NaN.
1997| The addition is performed according to the IEC/IEEE Standard for Binary
1998| Floating-Point Arithmetic.
1999*----------------------------------------------------------------------------*/
2000
2001static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2002                              float_status *status)
2003{
2004    int aExp, bExp, zExp;
2005    uint32_t aSig, bSig, zSig;
2006    int expDiff;
2007
2008    aSig = extractFloat32Frac( a );
2009    aExp = extractFloat32Exp( a );
2010    bSig = extractFloat32Frac( b );
2011    bExp = extractFloat32Exp( b );
2012    expDiff = aExp - bExp;
2013    aSig <<= 6;
2014    bSig <<= 6;
2015    if ( 0 < expDiff ) {
2016        if ( aExp == 0xFF ) {
2017            if (aSig) {
2018                return propagateFloat32NaN(a, b, status);
2019            }
2020            return a;
2021        }
2022        if ( bExp == 0 ) {
2023            --expDiff;
2024        }
2025        else {
2026            bSig |= 0x20000000;
2027        }
2028        shift32RightJamming( bSig, expDiff, &bSig );
2029        zExp = aExp;
2030    }
2031    else if ( expDiff < 0 ) {
2032        if ( bExp == 0xFF ) {
2033            if (bSig) {
2034                return propagateFloat32NaN(a, b, status);
2035            }
2036            return packFloat32( zSign, 0xFF, 0 );
2037        }
2038        if ( aExp == 0 ) {
2039            ++expDiff;
2040        }
2041        else {
2042            aSig |= 0x20000000;
2043        }
2044        shift32RightJamming( aSig, - expDiff, &aSig );
2045        zExp = bExp;
2046    }
2047    else {
2048        if ( aExp == 0xFF ) {
2049            if (aSig | bSig) {
2050                return propagateFloat32NaN(a, b, status);
2051            }
2052            return a;
2053        }
2054        if ( aExp == 0 ) {
2055            if (status->flush_to_zero) {
2056                if (aSig | bSig) {
2057                    float_raise(float_flag_output_denormal, status);
2058                }
2059                return packFloat32(zSign, 0, 0);
2060            }
2061            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2062        }
2063        zSig = 0x40000000 + aSig + bSig;
2064        zExp = aExp;
2065        goto roundAndPack;
2066    }
2067    aSig |= 0x20000000;
2068    zSig = ( aSig + bSig )<<1;
2069    --zExp;
2070    if ( (int32_t) zSig < 0 ) {
2071        zSig = aSig + bSig;
2072        ++zExp;
2073    }
2074 roundAndPack:
2075    return roundAndPackFloat32(zSign, zExp, zSig, status);
2076
2077}
2078
2079/*----------------------------------------------------------------------------
2080| Returns the result of subtracting the absolute values of the single-
2081| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2082| difference is negated before being returned.  `zSign' is ignored if the
2083| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2084| Standard for Binary Floating-Point Arithmetic.
2085*----------------------------------------------------------------------------*/
2086
2087static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2088                              float_status *status)
2089{
2090    int aExp, bExp, zExp;
2091    uint32_t aSig, bSig, zSig;
2092    int expDiff;
2093
2094    aSig = extractFloat32Frac( a );
2095    aExp = extractFloat32Exp( a );
2096    bSig = extractFloat32Frac( b );
2097    bExp = extractFloat32Exp( b );
2098    expDiff = aExp - bExp;
2099    aSig <<= 7;
2100    bSig <<= 7;
2101    if ( 0 < expDiff ) goto aExpBigger;
2102    if ( expDiff < 0 ) goto bExpBigger;
2103    if ( aExp == 0xFF ) {
2104        if (aSig | bSig) {
2105            return propagateFloat32NaN(a, b, status);
2106        }
2107        float_raise(float_flag_invalid, status);
2108        return float32_default_nan;
2109    }
2110    if ( aExp == 0 ) {
2111        aExp = 1;
2112        bExp = 1;
2113    }
2114    if ( bSig < aSig ) goto aBigger;
2115    if ( aSig < bSig ) goto bBigger;
2116    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2117 bExpBigger:
2118    if ( bExp == 0xFF ) {
2119        if (bSig) {
2120            return propagateFloat32NaN(a, b, status);
2121        }
2122        return packFloat32( zSign ^ 1, 0xFF, 0 );
2123    }
2124    if ( aExp == 0 ) {
2125        ++expDiff;
2126    }
2127    else {
2128        aSig |= 0x40000000;
2129    }
2130    shift32RightJamming( aSig, - expDiff, &aSig );
2131    bSig |= 0x40000000;
2132 bBigger:
2133    zSig = bSig - aSig;
2134    zExp = bExp;
2135    zSign ^= 1;
2136    goto normalizeRoundAndPack;
2137 aExpBigger:
2138    if ( aExp == 0xFF ) {
2139        if (aSig) {
2140            return propagateFloat32NaN(a, b, status);
2141        }
2142        return a;
2143    }
2144    if ( bExp == 0 ) {
2145        --expDiff;
2146    }
2147    else {
2148        bSig |= 0x40000000;
2149    }
2150    shift32RightJamming( bSig, expDiff, &bSig );
2151    aSig |= 0x40000000;
2152 aBigger:
2153    zSig = aSig - bSig;
2154    zExp = aExp;
2155 normalizeRoundAndPack:
2156    --zExp;
2157    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2158
2159}
2160
2161/*----------------------------------------------------------------------------
2162| Returns the result of adding the single-precision floating-point values `a'
2163| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2164| Binary Floating-Point Arithmetic.
2165*----------------------------------------------------------------------------*/
2166
2167float32 float32_add(float32 a, float32 b, float_status *status)
2168{
2169    flag aSign, bSign;
2170    a = float32_squash_input_denormal(a, status);
2171    b = float32_squash_input_denormal(b, status);
2172
2173    aSign = extractFloat32Sign( a );
2174    bSign = extractFloat32Sign( b );
2175    if ( aSign == bSign ) {
2176        return addFloat32Sigs(a, b, aSign, status);
2177    }
2178    else {
2179        return subFloat32Sigs(a, b, aSign, status);
2180    }
2181
2182}
2183
2184/*----------------------------------------------------------------------------
2185| Returns the result of subtracting the single-precision floating-point values
2186| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2187| for Binary Floating-Point Arithmetic.
2188*----------------------------------------------------------------------------*/
2189
2190float32 float32_sub(float32 a, float32 b, float_status *status)
2191{
2192    flag aSign, bSign;
2193    a = float32_squash_input_denormal(a, status);
2194    b = float32_squash_input_denormal(b, status);
2195
2196    aSign = extractFloat32Sign( a );
2197    bSign = extractFloat32Sign( b );
2198    if ( aSign == bSign ) {
2199        return subFloat32Sigs(a, b, aSign, status);
2200    }
2201    else {
2202        return addFloat32Sigs(a, b, aSign, status);
2203    }
2204
2205}
2206
2207/*----------------------------------------------------------------------------
2208| Returns the result of multiplying the single-precision floating-point values
2209| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2210| for Binary Floating-Point Arithmetic.
2211*----------------------------------------------------------------------------*/
2212
2213float32 float32_mul(float32 a, float32 b, float_status *status)
2214{
2215    flag aSign, bSign, zSign;
2216    int aExp, bExp, zExp;
2217    uint32_t aSig, bSig;
2218    uint64_t zSig64;
2219    uint32_t zSig;
2220
2221    a = float32_squash_input_denormal(a, status);
2222    b = float32_squash_input_denormal(b, status);
2223
2224    aSig = extractFloat32Frac( a );
2225    aExp = extractFloat32Exp( a );
2226    aSign = extractFloat32Sign( a );
2227    bSig = extractFloat32Frac( b );
2228    bExp = extractFloat32Exp( b );
2229    bSign = extractFloat32Sign( b );
2230    zSign = aSign ^ bSign;
2231    if ( aExp == 0xFF ) {
2232        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2233            return propagateFloat32NaN(a, b, status);
2234        }
2235        if ( ( bExp | bSig ) == 0 ) {
2236            float_raise(float_flag_invalid, status);
2237            return float32_default_nan;
2238        }
2239        return packFloat32( zSign, 0xFF, 0 );
2240    }
2241    if ( bExp == 0xFF ) {
2242        if (bSig) {
2243            return propagateFloat32NaN(a, b, status);
2244        }
2245        if ( ( aExp | aSig ) == 0 ) {
2246            float_raise(float_flag_invalid, status);
2247            return float32_default_nan;
2248        }
2249        return packFloat32( zSign, 0xFF, 0 );
2250    }
2251    if ( aExp == 0 ) {
2252        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2253        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254    }
2255    if ( bExp == 0 ) {
2256        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2257        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2258    }
2259    zExp = aExp + bExp - 0x7F;
2260    aSig = ( aSig | 0x00800000 )<<7;
2261    bSig = ( bSig | 0x00800000 )<<8;
2262    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2263    zSig = zSig64;
2264    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2265        zSig <<= 1;
2266        --zExp;
2267    }
2268    return roundAndPackFloat32(zSign, zExp, zSig, status);
2269
2270}
2271
2272/*----------------------------------------------------------------------------
2273| Returns the result of dividing the single-precision floating-point value `a'
2274| by the corresponding value `b'.  The operation is performed according to the
2275| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276*----------------------------------------------------------------------------*/
2277
2278float32 float32_div(float32 a, float32 b, float_status *status)
2279{
2280    flag aSign, bSign, zSign;
2281    int aExp, bExp, zExp;
2282    uint32_t aSig, bSig, zSig;
2283    a = float32_squash_input_denormal(a, status);
2284    b = float32_squash_input_denormal(b, status);
2285
2286    aSig = extractFloat32Frac( a );
2287    aExp = extractFloat32Exp( a );
2288    aSign = extractFloat32Sign( a );
2289    bSig = extractFloat32Frac( b );
2290    bExp = extractFloat32Exp( b );
2291    bSign = extractFloat32Sign( b );
2292    zSign = aSign ^ bSign;
2293    if ( aExp == 0xFF ) {
2294        if (aSig) {
2295            return propagateFloat32NaN(a, b, status);
2296        }
2297        if ( bExp == 0xFF ) {
2298            if (bSig) {
2299                return propagateFloat32NaN(a, b, status);
2300            }
2301            float_raise(float_flag_invalid, status);
2302            return float32_default_nan;
2303        }
2304        return packFloat32( zSign, 0xFF, 0 );
2305    }
2306    if ( bExp == 0xFF ) {
2307        if (bSig) {
2308            return propagateFloat32NaN(a, b, status);
2309        }
2310        return packFloat32( zSign, 0, 0 );
2311    }
2312    if ( bExp == 0 ) {
2313        if ( bSig == 0 ) {
2314            if ( ( aExp | aSig ) == 0 ) {
2315                float_raise(float_flag_invalid, status);
2316                return float32_default_nan;
2317            }
2318            float_raise(float_flag_divbyzero, status);
2319            return packFloat32( zSign, 0xFF, 0 );
2320        }
2321        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2322    }
2323    if ( aExp == 0 ) {
2324        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2325        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2326    }
2327    zExp = aExp - bExp + 0x7D;
2328    aSig = ( aSig | 0x00800000 )<<7;
2329    bSig = ( bSig | 0x00800000 )<<8;
2330    if ( bSig <= ( aSig + aSig ) ) {
2331        aSig >>= 1;
2332        ++zExp;
2333    }
2334    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2335    if ( ( zSig & 0x3F ) == 0 ) {
2336        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2337    }
2338    return roundAndPackFloat32(zSign, zExp, zSig, status);
2339
2340}
2341
2342/*----------------------------------------------------------------------------
2343| Returns the remainder of the single-precision floating-point value `a'
2344| with respect to the corresponding value `b'.  The operation is performed
2345| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346*----------------------------------------------------------------------------*/
2347
2348float32 float32_rem(float32 a, float32 b, float_status *status)
2349{
2350    flag aSign, zSign;
2351    int aExp, bExp, expDiff;
2352    uint32_t aSig, bSig;
2353    uint32_t q;
2354    uint64_t aSig64, bSig64, q64;
2355    uint32_t alternateASig;
2356    int32_t sigMean;
2357    a = float32_squash_input_denormal(a, status);
2358    b = float32_squash_input_denormal(b, status);
2359
2360    aSig = extractFloat32Frac( a );
2361    aExp = extractFloat32Exp( a );
2362    aSign = extractFloat32Sign( a );
2363    bSig = extractFloat32Frac( b );
2364    bExp = extractFloat32Exp( b );
2365    if ( aExp == 0xFF ) {
2366        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2367            return propagateFloat32NaN(a, b, status);
2368        }
2369        float_raise(float_flag_invalid, status);
2370        return float32_default_nan;
2371    }
2372    if ( bExp == 0xFF ) {
2373        if (bSig) {
2374            return propagateFloat32NaN(a, b, status);
2375        }
2376        return a;
2377    }
2378    if ( bExp == 0 ) {
2379        if ( bSig == 0 ) {
2380            float_raise(float_flag_invalid, status);
2381            return float32_default_nan;
2382        }
2383        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2384    }
2385    if ( aExp == 0 ) {
2386        if ( aSig == 0 ) return a;
2387        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2388    }
2389    expDiff = aExp - bExp;
2390    aSig |= 0x00800000;
2391    bSig |= 0x00800000;
2392    if ( expDiff < 32 ) {
2393        aSig <<= 8;
2394        bSig <<= 8;
2395        if ( expDiff < 0 ) {
2396            if ( expDiff < -1 ) return a;
2397            aSig >>= 1;
2398        }
2399        q = ( bSig <= aSig );
2400        if ( q ) aSig -= bSig;
2401        if ( 0 < expDiff ) {
2402            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2403            q >>= 32 - expDiff;
2404            bSig >>= 2;
2405            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2406        }
2407        else {
2408            aSig >>= 2;
2409            bSig >>= 2;
2410        }
2411    }
2412    else {
2413        if ( bSig <= aSig ) aSig -= bSig;
2414        aSig64 = ( (uint64_t) aSig )<<40;
2415        bSig64 = ( (uint64_t) bSig )<<40;
2416        expDiff -= 64;
2417        while ( 0 < expDiff ) {
2418            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420            aSig64 = - ( ( bSig * q64 )<<38 );
2421            expDiff -= 62;
2422        }
2423        expDiff += 64;
2424        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2425        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2426        q = q64>>( 64 - expDiff );
2427        bSig <<= 6;
2428        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2429    }
2430    do {
2431        alternateASig = aSig;
2432        ++q;
2433        aSig -= bSig;
2434    } while ( 0 <= (int32_t) aSig );
2435    sigMean = aSig + alternateASig;
2436    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2437        aSig = alternateASig;
2438    }
2439    zSign = ( (int32_t) aSig < 0 );
2440    if ( zSign ) aSig = - aSig;
2441    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2442}
2443
2444/*----------------------------------------------------------------------------
2445| Returns the result of multiplying the single-precision floating-point values
2446| `a' and `b' then adding 'c', with no intermediate rounding step after the
2447| multiplication.  The operation is performed according to the IEC/IEEE
2448| Standard for Binary Floating-Point Arithmetic 754-2008.
2449| The flags argument allows the caller to select negation of the
2450| addend, the intermediate product, or the final result. (The difference
2451| between this and having the caller do a separate negation is that negating
2452| externally will flip the sign bit on NaNs.)
2453*----------------------------------------------------------------------------*/
2454
2455float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2456                       float_status *status)
2457{
2458    flag aSign, bSign, cSign, zSign;
2459    int aExp, bExp, cExp, pExp, zExp, expDiff;
2460    uint32_t aSig, bSig, cSig;
2461    flag pInf, pZero, pSign;
2462    uint64_t pSig64, cSig64, zSig64;
2463    uint32_t pSig;
2464    int shiftcount;
2465    flag signflip, infzero;
2466
2467    a = float32_squash_input_denormal(a, status);
2468    b = float32_squash_input_denormal(b, status);
2469    c = float32_squash_input_denormal(c, status);
2470    aSig = extractFloat32Frac(a);
2471    aExp = extractFloat32Exp(a);
2472    aSign = extractFloat32Sign(a);
2473    bSig = extractFloat32Frac(b);
2474    bExp = extractFloat32Exp(b);
2475    bSign = extractFloat32Sign(b);
2476    cSig = extractFloat32Frac(c);
2477    cExp = extractFloat32Exp(c);
2478    cSign = extractFloat32Sign(c);
2479
2480    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2481               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2482
2483    /* It is implementation-defined whether the cases of (0,inf,qnan)
2484     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485     * they return if they do), so we have to hand this information
2486     * off to the target-specific pick-a-NaN routine.
2487     */
2488    if (((aExp == 0xff) && aSig) ||
2489        ((bExp == 0xff) && bSig) ||
2490        ((cExp == 0xff) && cSig)) {
2491        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2492    }
2493
2494    if (infzero) {
2495        float_raise(float_flag_invalid, status);
2496        return float32_default_nan;
2497    }
2498
2499    if (flags & float_muladd_negate_c) {
2500        cSign ^= 1;
2501    }
2502
2503    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2504
2505    /* Work out the sign and type of the product */
2506    pSign = aSign ^ bSign;
2507    if (flags & float_muladd_negate_product) {
2508        pSign ^= 1;
2509    }
2510    pInf = (aExp == 0xff) || (bExp == 0xff);
2511    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2512
2513    if (cExp == 0xff) {
2514        if (pInf && (pSign ^ cSign)) {
2515            /* addition of opposite-signed infinities => InvalidOperation */
2516            float_raise(float_flag_invalid, status);
2517            return float32_default_nan;
2518        }
2519        /* Otherwise generate an infinity of the same sign */
2520        return packFloat32(cSign ^ signflip, 0xff, 0);
2521    }
2522
2523    if (pInf) {
2524        return packFloat32(pSign ^ signflip, 0xff, 0);
2525    }
2526
2527    if (pZero) {
2528        if (cExp == 0) {
2529            if (cSig == 0) {
2530                /* Adding two exact zeroes */
2531                if (pSign == cSign) {
2532                    zSign = pSign;
2533                } else if (status->float_rounding_mode == float_round_down) {
2534                    zSign = 1;
2535                } else {
2536                    zSign = 0;
2537                }
2538                return packFloat32(zSign ^ signflip, 0, 0);
2539            }
2540            /* Exact zero plus a denorm */
2541            if (status->flush_to_zero) {
2542                float_raise(float_flag_output_denormal, status);
2543                return packFloat32(cSign ^ signflip, 0, 0);
2544            }
2545        }
2546        /* Zero plus something non-zero : just return the something */
2547        if (flags & float_muladd_halve_result) {
2548            if (cExp == 0) {
2549                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2550            }
2551            /* Subtract one to halve, and one again because roundAndPackFloat32
2552             * wants one less than the true exponent.
2553             */
2554            cExp -= 2;
2555            cSig = (cSig | 0x00800000) << 7;
2556            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2557        }
2558        return packFloat32(cSign ^ signflip, cExp, cSig);
2559    }
2560
2561    if (aExp == 0) {
2562        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2563    }
2564    if (bExp == 0) {
2565        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2566    }
2567
2568    /* Calculate the actual result a * b + c */
2569
2570    /* Multiply first; this is easy. */
2571    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572     * because we want the true exponent, not the "one-less-than"
2573     * flavour that roundAndPackFloat32() takes.
2574     */
2575    pExp = aExp + bExp - 0x7e;
2576    aSig = (aSig | 0x00800000) << 7;
2577    bSig = (bSig | 0x00800000) << 8;
2578    pSig64 = (uint64_t)aSig * bSig;
2579    if ((int64_t)(pSig64 << 1) >= 0) {
2580        pSig64 <<= 1;
2581        pExp--;
2582    }
2583
2584    zSign = pSign ^ signflip;
2585
2586    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2587     * position 62.
2588     */
2589    if (cExp == 0) {
2590        if (!cSig) {
2591            /* Throw out the special case of c being an exact zero now */
2592            shift64RightJamming(pSig64, 32, &pSig64);
2593            pSig = pSig64;
2594            if (flags & float_muladd_halve_result) {
2595                pExp--;
2596            }
2597            return roundAndPackFloat32(zSign, pExp - 1,
2598                                       pSig, status);
2599        }
2600        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2601    }
2602
2603    cSig64 = (uint64_t)cSig << (62 - 23);
2604    cSig64 |= LIT64(0x4000000000000000);
2605    expDiff = pExp - cExp;
2606
2607    if (pSign == cSign) {
2608        /* Addition */
2609        if (expDiff > 0) {
2610            /* scale c to match p */
2611            shift64RightJamming(cSig64, expDiff, &cSig64);
2612            zExp = pExp;
2613        } else if (expDiff < 0) {
2614            /* scale p to match c */
2615            shift64RightJamming(pSig64, -expDiff, &pSig64);
2616            zExp = cExp;
2617        } else {
2618            /* no scaling needed */
2619            zExp = cExp;
2620        }
2621        /* Add significands and make sure explicit bit ends up in posn 62 */
2622        zSig64 = pSig64 + cSig64;
2623        if ((int64_t)zSig64 < 0) {
2624            shift64RightJamming(zSig64, 1, &zSig64);
2625        } else {
2626            zExp--;
2627        }
2628    } else {
2629        /* Subtraction */
2630        if (expDiff > 0) {
2631            shift64RightJamming(cSig64, expDiff, &cSig64);
2632            zSig64 = pSig64 - cSig64;
2633            zExp = pExp;
2634        } else if (expDiff < 0) {
2635            shift64RightJamming(pSig64, -expDiff, &pSig64);
2636            zSig64 = cSig64 - pSig64;
2637            zExp = cExp;
2638            zSign ^= 1;
2639        } else {
2640            zExp = pExp;
2641            if (cSig64 < pSig64) {
2642                zSig64 = pSig64 - cSig64;
2643            } else if (pSig64 < cSig64) {
2644                zSig64 = cSig64 - pSig64;
2645                zSign ^= 1;
2646            } else {
2647                /* Exact zero */
2648                zSign = signflip;
2649                if (status->float_rounding_mode == float_round_down) {
2650                    zSign ^= 1;
2651                }
2652                return packFloat32(zSign, 0, 0);
2653            }
2654        }
2655        --zExp;
2656        /* Normalize to put the explicit bit back into bit 62. */
2657        shiftcount = countLeadingZeros64(zSig64) - 1;
2658        zSig64 <<= shiftcount;
2659        zExp -= shiftcount;
2660    }
2661    if (flags & float_muladd_halve_result) {
2662        zExp--;
2663    }
2664
2665    shift64RightJamming(zSig64, 32, &zSig64);
2666    return roundAndPackFloat32(zSign, zExp, zSig64, status);
2667}
2668
2669
2670/*----------------------------------------------------------------------------
2671| Returns the square root of the single-precision floating-point value `a'.
2672| The operation is performed according to the IEC/IEEE Standard for Binary
2673| Floating-Point Arithmetic.
2674*----------------------------------------------------------------------------*/
2675
2676float32 float32_sqrt(float32 a, float_status *status)
2677{
2678    flag aSign;
2679    int aExp, zExp;
2680    uint32_t aSig, zSig;
2681    uint64_t rem, term;
2682    a = float32_squash_input_denormal(a, status);
2683
2684    aSig = extractFloat32Frac( a );
2685    aExp = extractFloat32Exp( a );
2686    aSign = extractFloat32Sign( a );
2687    if ( aExp == 0xFF ) {
2688        if (aSig) {
2689            return propagateFloat32NaN(a, float32_zero, status);
2690        }
2691        if ( ! aSign ) return a;
2692        float_raise(float_flag_invalid, status);
2693        return float32_default_nan;
2694    }
2695    if ( aSign ) {
2696        if ( ( aExp | aSig ) == 0 ) return a;
2697        float_raise(float_flag_invalid, status);
2698        return float32_default_nan;
2699    }
2700    if ( aExp == 0 ) {
2701        if ( aSig == 0 ) return float32_zero;
2702        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2703    }
2704    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2705    aSig = ( aSig | 0x00800000 )<<8;
2706    zSig = estimateSqrt32( aExp, aSig ) + 2;
2707    if ( ( zSig & 0x7F ) <= 5 ) {
2708        if ( zSig < 2 ) {
2709            zSig = 0x7FFFFFFF;
2710            goto roundAndPack;
2711        }
2712        aSig >>= aExp & 1;
2713        term = ( (uint64_t) zSig ) * zSig;
2714        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2715        while ( (int64_t) rem < 0 ) {
2716            --zSig;
2717            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2718        }
2719        zSig |= ( rem != 0 );
2720    }
2721    shift32RightJamming( zSig, 1, &zSig );
2722 roundAndPack:
2723    return roundAndPackFloat32(0, zExp, zSig, status);
2724
2725}
2726
2727/*----------------------------------------------------------------------------
2728| Returns the binary exponential of the single-precision floating-point value
2729| `a'. The operation is performed according to the IEC/IEEE Standard for
2730| Binary Floating-Point Arithmetic.
2731|
2732| Uses the following identities:
2733|
2734| 1. -------------------------------------------------------------------------
2735|      x    x*ln(2)
2736|     2  = e
2737|
2738| 2. -------------------------------------------------------------------------
2739|                      2     3     4     5           n
2740|      x        x     x     x     x     x           x
2741|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2742|               1!    2!    3!    4!    5!          n!
2743*----------------------------------------------------------------------------*/
2744
2745static const float64 float32_exp2_coefficients[15] =
2746{
2747    const_float64( 0x3ff0000000000000ll ), /*  1 */
2748    const_float64( 0x3fe0000000000000ll ), /*  2 */
2749    const_float64( 0x3fc5555555555555ll ), /*  3 */
2750    const_float64( 0x3fa5555555555555ll ), /*  4 */
2751    const_float64( 0x3f81111111111111ll ), /*  5 */
2752    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2753    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2754    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2755    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2756    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2757    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2758    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2759    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2760    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2761    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2762};
2763
2764float32 float32_exp2(float32 a, float_status *status)
2765{
2766    flag aSign;
2767    int aExp;
2768    uint32_t aSig;
2769    float64 r, x, xn;
2770    int i;
2771    a = float32_squash_input_denormal(a, status);
2772
2773    aSig = extractFloat32Frac( a );
2774    aExp = extractFloat32Exp( a );
2775    aSign = extractFloat32Sign( a );
2776
2777    if ( aExp == 0xFF) {
2778        if (aSig) {
2779            return propagateFloat32NaN(a, float32_zero, status);
2780        }
2781        return (aSign) ? float32_zero : a;
2782    }
2783    if (aExp == 0) {
2784        if (aSig == 0) return float32_one;
2785    }
2786
2787    float_raise(float_flag_inexact, status);
2788
2789    /* ******************************* */
2790    /* using float64 for approximation */
2791    /* ******************************* */
2792    x = float32_to_float64(a, status);
2793    x = float64_mul(x, float64_ln2, status);
2794
2795    xn = x;
2796    r = float64_one;
2797    for (i = 0 ; i < 15 ; i++) {
2798        float64 f;
2799
2800        f = float64_mul(xn, float32_exp2_coefficients[i], status);
2801        r = float64_add(r, f, status);
2802
2803        xn = float64_mul(xn, x, status);
2804    }
2805
2806    return float64_to_float32(r, status);
2807}
2808
2809/*----------------------------------------------------------------------------
2810| Returns the binary log of the single-precision floating-point value `a'.
2811| The operation is performed according to the IEC/IEEE Standard for Binary
2812| Floating-Point Arithmetic.
2813*----------------------------------------------------------------------------*/
2814float32 float32_log2(float32 a, float_status *status)
2815{
2816    flag aSign, zSign;
2817    int aExp;
2818    uint32_t aSig, zSig, i;
2819
2820    a = float32_squash_input_denormal(a, status);
2821    aSig = extractFloat32Frac( a );
2822    aExp = extractFloat32Exp( a );
2823    aSign = extractFloat32Sign( a );
2824
2825    if ( aExp == 0 ) {
2826        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2827        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2828    }
2829    if ( aSign ) {
2830        float_raise(float_flag_invalid, status);
2831        return float32_default_nan;
2832    }
2833    if ( aExp == 0xFF ) {
2834        if (aSig) {
2835            return propagateFloat32NaN(a, float32_zero, status);
2836        }
2837        return a;
2838    }
2839
2840    aExp -= 0x7F;
2841    aSig |= 0x00800000;
2842    zSign = aExp < 0;
2843    zSig = aExp << 23;
2844
2845    for (i = 1 << 22; i > 0; i >>= 1) {
2846        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2847        if ( aSig & 0x01000000 ) {
2848            aSig >>= 1;
2849            zSig |= i;
2850        }
2851    }
2852
2853    if ( zSign )
2854        zSig = -zSig;
2855
2856    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2857}
2858
2859/*----------------------------------------------------------------------------
2860| Returns 1 if the single-precision floating-point value `a' is equal to
2861| the corresponding value `b', and 0 otherwise.  The invalid exception is
2862| raised if either operand is a NaN.  Otherwise, the comparison is performed
2863| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864*----------------------------------------------------------------------------*/
2865
2866int float32_eq(float32 a, float32 b, float_status *status)
2867{
2868    uint32_t av, bv;
2869    a = float32_squash_input_denormal(a, status);
2870    b = float32_squash_input_denormal(b, status);
2871
2872    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2873         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2874       ) {
2875        float_raise(float_flag_invalid, status);
2876        return 0;
2877    }
2878    av = float32_val(a);
2879    bv = float32_val(b);
2880    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2881}
2882
2883/*----------------------------------------------------------------------------
2884| Returns 1 if the single-precision floating-point value `a' is less than
2885| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2886| exception is raised if either operand is a NaN.  The comparison is performed
2887| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888*----------------------------------------------------------------------------*/
2889
2890int float32_le(float32 a, float32 b, float_status *status)
2891{
2892    flag aSign, bSign;
2893    uint32_t av, bv;
2894    a = float32_squash_input_denormal(a, status);
2895    b = float32_squash_input_denormal(b, status);
2896
2897    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2899       ) {
2900        float_raise(float_flag_invalid, status);
2901        return 0;
2902    }
2903    aSign = extractFloat32Sign( a );
2904    bSign = extractFloat32Sign( b );
2905    av = float32_val(a);
2906    bv = float32_val(b);
2907    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2908    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2909
2910}
2911
2912/*----------------------------------------------------------------------------
2913| Returns 1 if the single-precision floating-point value `a' is less than
2914| the corresponding value `b', and 0 otherwise.  The invalid exception is
2915| raised if either operand is a NaN.  The comparison is performed according
2916| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917*----------------------------------------------------------------------------*/
2918
2919int float32_lt(float32 a, float32 b, float_status *status)
2920{
2921    flag aSign, bSign;
2922    uint32_t av, bv;
2923    a = float32_squash_input_denormal(a, status);
2924    b = float32_squash_input_denormal(b, status);
2925
2926    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2927         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2928       ) {
2929        float_raise(float_flag_invalid, status);
2930        return 0;
2931    }
2932    aSign = extractFloat32Sign( a );
2933    bSign = extractFloat32Sign( b );
2934    av = float32_val(a);
2935    bv = float32_val(b);
2936    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2937    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2938
2939}
2940
2941/*----------------------------------------------------------------------------
2942| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943| be compared, and 0 otherwise.  The invalid exception is raised if either
2944| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2945| Standard for Binary Floating-Point Arithmetic.
2946*----------------------------------------------------------------------------*/
2947
2948int float32_unordered(float32 a, float32 b, float_status *status)
2949{
2950    a = float32_squash_input_denormal(a, status);
2951    b = float32_squash_input_denormal(b, status);
2952
2953    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2954         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2955       ) {
2956        float_raise(float_flag_invalid, status);
2957        return 1;
2958    }
2959    return 0;
2960}
2961
2962/*----------------------------------------------------------------------------
2963| Returns 1 if the single-precision floating-point value `a' is equal to
2964| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2965| exception.  The comparison is performed according to the IEC/IEEE Standard
2966| for Binary Floating-Point Arithmetic.
2967*----------------------------------------------------------------------------*/
2968
2969int float32_eq_quiet(float32 a, float32 b, float_status *status)
2970{
2971    a = float32_squash_input_denormal(a, status);
2972    b = float32_squash_input_denormal(b, status);
2973
2974    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2975         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2976       ) {
2977        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2978            float_raise(float_flag_invalid, status);
2979        }
2980        return 0;
2981    }
2982    return ( float32_val(a) == float32_val(b) ) ||
2983            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2984}
2985
2986/*----------------------------------------------------------------------------
2987| Returns 1 if the single-precision floating-point value `a' is less than or
2988| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2989| cause an exception.  Otherwise, the comparison is performed according to the
2990| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991*----------------------------------------------------------------------------*/
2992
2993int float32_le_quiet(float32 a, float32 b, float_status *status)
2994{
2995    flag aSign, bSign;
2996    uint32_t av, bv;
2997    a = float32_squash_input_denormal(a, status);
2998    b = float32_squash_input_denormal(b, status);
2999
3000    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3001         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3002       ) {
3003        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3004            float_raise(float_flag_invalid, status);
3005        }
3006        return 0;
3007    }
3008    aSign = extractFloat32Sign( a );
3009    bSign = extractFloat32Sign( b );
3010    av = float32_val(a);
3011    bv = float32_val(b);
3012    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3013    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3014
3015}
3016
3017/*----------------------------------------------------------------------------
3018| Returns 1 if the single-precision floating-point value `a' is less than
3019| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3020| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3021| Standard for Binary Floating-Point Arithmetic.
3022*----------------------------------------------------------------------------*/
3023
3024int float32_lt_quiet(float32 a, float32 b, float_status *status)
3025{
3026    flag aSign, bSign;
3027    uint32_t av, bv;
3028    a = float32_squash_input_denormal(a, status);
3029    b = float32_squash_input_denormal(b, status);
3030
3031    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3032         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3033       ) {
3034        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3035            float_raise(float_flag_invalid, status);
3036        }
3037        return 0;
3038    }
3039    aSign = extractFloat32Sign( a );
3040    bSign = extractFloat32Sign( b );
3041    av = float32_val(a);
3042    bv = float32_val(b);
3043    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3044    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3045
3046}
3047
3048/*----------------------------------------------------------------------------
3049| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3050| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3051| comparison is performed according to the IEC/IEEE Standard for Binary
3052| Floating-Point Arithmetic.
3053*----------------------------------------------------------------------------*/
3054
3055int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3056{
3057    a = float32_squash_input_denormal(a, status);
3058    b = float32_squash_input_denormal(b, status);
3059
3060    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3061         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3062       ) {
3063        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3064            float_raise(float_flag_invalid, status);
3065        }
3066        return 1;
3067    }
3068    return 0;
3069}
3070
3071/*----------------------------------------------------------------------------
3072| Returns the result of converting the double-precision floating-point value
3073| `a' to the 32-bit two's complement integer format.  The conversion is
3074| performed according to the IEC/IEEE Standard for Binary Floating-Point
3075| Arithmetic---which means in particular that the conversion is rounded
3076| according to the current rounding mode.  If `a' is a NaN, the largest
3077| positive integer is returned.  Otherwise, if the conversion overflows, the
3078| largest integer with the same sign as `a' is returned.
3079*----------------------------------------------------------------------------*/
3080
3081int32_t float64_to_int32(float64 a, float_status *status)
3082{
3083    flag aSign;
3084    int aExp;
3085    int shiftCount;
3086    uint64_t aSig;
3087    a = float64_squash_input_denormal(a, status);
3088
3089    aSig = extractFloat64Frac( a );
3090    aExp = extractFloat64Exp( a );
3091    aSign = extractFloat64Sign( a );
3092    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3093    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3094    shiftCount = 0x42C - aExp;
3095    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3096    return roundAndPackInt32(aSign, aSig, status);
3097
3098}
3099
3100/*----------------------------------------------------------------------------
3101| Returns the result of converting the double-precision floating-point value
3102| `a' to the 32-bit two's complement integer format.  The conversion is
3103| performed according to the IEC/IEEE Standard for Binary Floating-Point
3104| Arithmetic, except that the conversion is always rounded toward zero.
3105| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3106| the conversion overflows, the largest integer with the same sign as `a' is
3107| returned.
3108*----------------------------------------------------------------------------*/
3109
3110int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3111{
3112    flag aSign;
3113    int aExp;
3114    int shiftCount;
3115    uint64_t aSig, savedASig;
3116    int32_t z;
3117    a = float64_squash_input_denormal(a, status);
3118
3119    aSig = extractFloat64Frac( a );
3120    aExp = extractFloat64Exp( a );
3121    aSign = extractFloat64Sign( a );
3122    if ( 0x41E < aExp ) {
3123        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3124        goto invalid;
3125    }
3126    else if ( aExp < 0x3FF ) {
3127        if (aExp || aSig) {
3128            status->float_exception_flags |= float_flag_inexact;
3129        }
3130        return 0;
3131    }
3132    aSig |= LIT64( 0x0010000000000000 );
3133    shiftCount = 0x433 - aExp;
3134    savedASig = aSig;
3135    aSig >>= shiftCount;
3136    z = aSig;
3137    if ( aSign ) z = - z;
3138    if ( ( z < 0 ) ^ aSign ) {
3139 invalid:
3140        float_raise(float_flag_invalid, status);
3141        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3142    }
3143    if ( ( aSig<<shiftCount ) != savedASig ) {
3144        status->float_exception_flags |= float_flag_inexact;
3145    }
3146    return z;
3147
3148}
3149
3150/*----------------------------------------------------------------------------
3151| Returns the result of converting the double-precision floating-point value
3152| `a' to the 16-bit two's complement integer format.  The conversion is
3153| performed according to the IEC/IEEE Standard for Binary Floating-Point
3154| Arithmetic, except that the conversion is always rounded toward zero.
3155| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3156| the conversion overflows, the largest integer with the same sign as `a' is
3157| returned.
3158*----------------------------------------------------------------------------*/
3159
3160int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3161{
3162    flag aSign;
3163    int aExp;
3164    int shiftCount;
3165    uint64_t aSig, savedASig;
3166    int32_t z;
3167
3168    aSig = extractFloat64Frac( a );
3169    aExp = extractFloat64Exp( a );
3170    aSign = extractFloat64Sign( a );
3171    if ( 0x40E < aExp ) {
3172        if ( ( aExp == 0x7FF ) && aSig ) {
3173            aSign = 0;
3174        }
3175        goto invalid;
3176    }
3177    else if ( aExp < 0x3FF ) {
3178        if ( aExp || aSig ) {
3179            status->float_exception_flags |= float_flag_inexact;
3180        }
3181        return 0;
3182    }
3183    aSig |= LIT64( 0x0010000000000000 );
3184    shiftCount = 0x433 - aExp;
3185    savedASig = aSig;
3186    aSig >>= shiftCount;
3187    z = aSig;
3188    if ( aSign ) {
3189        z = - z;
3190    }
3191    if ( ( (int16_t)z < 0 ) ^ aSign ) {
3192 invalid:
3193        float_raise(float_flag_invalid, status);
3194        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3195    }
3196    if ( ( aSig<<shiftCount ) != savedASig ) {
3197        status->float_exception_flags |= float_flag_inexact;
3198    }
3199    return z;
3200}
3201
3202/*----------------------------------------------------------------------------
3203| Returns the result of converting the double-precision floating-point value
3204| `a' to the 64-bit two's complement integer format.  The conversion is
3205| performed according to the IEC/IEEE Standard for Binary Floating-Point
3206| Arithmetic---which means in particular that the conversion is rounded
3207| according to the current rounding mode.  If `a' is a NaN, the largest
3208| positive integer is returned.  Otherwise, if the conversion overflows, the
3209| largest integer with the same sign as `a' is returned.
3210*----------------------------------------------------------------------------*/
3211
3212int64_t float64_to_int64(float64 a, float_status *status)
3213{
3214    flag aSign;
3215    int aExp;
3216    int shiftCount;
3217    uint64_t aSig, aSigExtra;
3218    a = float64_squash_input_denormal(a, status);
3219
3220    aSig = extractFloat64Frac( a );
3221    aExp = extractFloat64Exp( a );
3222    aSign = extractFloat64Sign( a );
3223    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3224    shiftCount = 0x433 - aExp;
3225    if ( shiftCount <= 0 ) {
3226        if ( 0x43E < aExp ) {
3227            float_raise(float_flag_invalid, status);
3228            if (    ! aSign
3229                 || (    ( aExp == 0x7FF )
3230                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3231               ) {
3232                return LIT64( 0x7FFFFFFFFFFFFFFF );
3233            }
3234            return (int64_t) LIT64( 0x8000000000000000 );
3235        }
3236        aSigExtra = 0;
3237        aSig <<= - shiftCount;
3238    }
3239    else {
3240        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3241    }
3242    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3243
3244}
3245
3246/*----------------------------------------------------------------------------
3247| Returns the result of converting the double-precision floating-point value
3248| `a' to the 64-bit two's complement integer format.  The conversion is
3249| performed according to the IEC/IEEE Standard for Binary Floating-Point
3250| Arithmetic, except that the conversion is always rounded toward zero.
3251| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3252| the conversion overflows, the largest integer with the same sign as `a' is
3253| returned.
3254*----------------------------------------------------------------------------*/
3255
3256int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3257{
3258    flag aSign;
3259    int aExp;
3260    int shiftCount;
3261    uint64_t aSig;
3262    int64_t z;
3263    a = float64_squash_input_denormal(a, status);
3264
3265    aSig = extractFloat64Frac( a );
3266    aExp = extractFloat64Exp( a );
3267    aSign = extractFloat64Sign( a );
3268    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3269    shiftCount = aExp - 0x433;
3270    if ( 0 <= shiftCount ) {
3271        if ( 0x43E <= aExp ) {
3272            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3273                float_raise(float_flag_invalid, status);
3274                if (    ! aSign
3275                     || (    ( aExp == 0x7FF )
3276                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3277                   ) {
3278                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3279                }
3280            }
3281            return (int64_t) LIT64( 0x8000000000000000 );
3282        }
3283        z = aSig<<shiftCount;
3284    }
3285    else {
3286        if ( aExp < 0x3FE ) {
3287            if (aExp | aSig) {
3288                status->float_exception_flags |= float_flag_inexact;
3289            }
3290            return 0;
3291        }
3292        z = aSig>>( - shiftCount );
3293        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3294            status->float_exception_flags |= float_flag_inexact;
3295        }
3296    }
3297    if ( aSign ) z = - z;
3298    return z;
3299
3300}
3301
3302/*----------------------------------------------------------------------------
3303| Returns the result of converting the double-precision floating-point value
3304| `a' to the single-precision floating-point format.  The conversion is
3305| performed according to the IEC/IEEE Standard for Binary Floating-Point
3306| Arithmetic.
3307*----------------------------------------------------------------------------*/
3308
3309float32 float64_to_float32(float64 a, float_status *status)
3310{
3311    flag aSign;
3312    int aExp;
3313    uint64_t aSig;
3314    uint32_t zSig;
3315    a = float64_squash_input_denormal(a, status);
3316
3317    aSig = extractFloat64Frac( a );
3318    aExp = extractFloat64Exp( a );
3319    aSign = extractFloat64Sign( a );
3320    if ( aExp == 0x7FF ) {
3321        if (aSig) {
3322            return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3323        }
3324        return packFloat32( aSign, 0xFF, 0 );
3325    }
3326    shift64RightJamming( aSig, 22, &aSig );
3327    zSig = aSig;
3328    if ( aExp || zSig ) {
3329        zSig |= 0x40000000;
3330        aExp -= 0x381;
3331    }
3332    return roundAndPackFloat32(aSign, aExp, zSig, status);
3333
3334}
3335
3336
3337/*----------------------------------------------------------------------------
3338| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3339| half-precision floating-point value, returning the result.  After being
3340| shifted into the proper positions, the three fields are simply added
3341| together to form the result.  This means that any integer portion of `zSig'
3342| will be added into the exponent.  Since a properly normalized significand
3343| will have an integer portion equal to 1, the `zExp' input should be 1 less
3344| than the desired result exponent whenever `zSig' is a complete, normalized
3345| significand.
3346*----------------------------------------------------------------------------*/
3347static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3348{
3349    return make_float16(
3350        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3351}
3352
3353/*----------------------------------------------------------------------------
3354| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3355| and significand `zSig', and returns the proper half-precision floating-
3356| point value corresponding to the abstract input.  Ordinarily, the abstract
3357| value is simply rounded and packed into the half-precision format, with
3358| the inexact exception raised if the abstract input cannot be represented
3359| exactly.  However, if the abstract value is too large, the overflow and
3360| inexact exceptions are raised and an infinity or maximal finite value is
3361| returned.  If the abstract value is too small, the input value is rounded to
3362| a subnormal number, and the underflow and inexact exceptions are raised if
3363| the abstract input cannot be represented exactly as a subnormal half-
3364| precision floating-point number.
3365| The `ieee' flag indicates whether to use IEEE standard half precision, or
3366| ARM-style "alternative representation", which omits the NaN and Inf
3367| encodings in order to raise the maximum representable exponent by one.
3368|     The input significand `zSig' has its binary point between bits 22
3369| and 23, which is 13 bits to the left of the usual location.  This shifted
3370| significand must be normalized or smaller.  If `zSig' is not normalized,
3371| `zExp' must be 0; in that case, the result returned is a subnormal number,
3372| and it must not require rounding.  In the usual case that `zSig' is
3373| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3374| Note the slightly odd position of the binary point in zSig compared with the
3375| other roundAndPackFloat functions. This should probably be fixed if we
3376| need to implement more float16 routines than just conversion.
3377| The handling of underflow and overflow follows the IEC/IEEE Standard for
3378| Binary Floating-Point Arithmetic.
3379*----------------------------------------------------------------------------*/
3380
3381static float16 roundAndPackFloat16(flag zSign, int zExp,
3382                                   uint32_t zSig, flag ieee,
3383                                   float_status *status)
3384{
3385    int maxexp = ieee ? 29 : 30;
3386    uint32_t mask;
3387    uint32_t increment;
3388    bool rounding_bumps_exp;
3389    bool is_tiny = false;
3390
3391    /* Calculate the mask of bits of the mantissa which are not
3392     * representable in half-precision and will be lost.
3393     */
3394    if (zExp < 1) {
3395        /* Will be denormal in halfprec */
3396        mask = 0x00ffffff;
3397        if (zExp >= -11) {
3398            mask >>= 11 + zExp;
3399        }
3400    } else {
3401        /* Normal number in halfprec */
3402        mask = 0x00001fff;
3403    }
3404
3405    switch (status->float_rounding_mode) {
3406    case float_round_nearest_even:
3407        increment = (mask + 1) >> 1;
3408        if ((zSig & mask) == increment) {
3409            increment = zSig & (increment << 1);
3410        }
3411        break;
3412    case float_round_ties_away:
3413        increment = (mask + 1) >> 1;
3414        break;
3415    case float_round_up:
3416        increment = zSign ? 0 : mask;
3417        break;
3418    case float_round_down:
3419        increment = zSign ? mask : 0;
3420        break;
3421    default: /* round_to_zero */
3422        increment = 0;
3423        break;
3424    }
3425
3426    rounding_bumps_exp = (zSig + increment >= 0x01000000);
3427
3428    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3429        if (ieee) {
3430            float_raise(float_flag_overflow | float_flag_inexact, status);
3431            return packFloat16(zSign, 0x1f, 0);
3432        } else {
3433            float_raise(float_flag_invalid, status);
3434            return packFloat16(zSign, 0x1f, 0x3ff);
3435        }
3436    }
3437
3438    if (zExp < 0) {
3439        /* Note that flush-to-zero does not affect half-precision results */
3440        is_tiny =
3441            (status->float_detect_tininess == float_tininess_before_rounding)
3442            || (zExp < -1)
3443            || (!rounding_bumps_exp);
3444    }
3445    if (zSig & mask) {
3446        float_raise(float_flag_inexact, status);
3447        if (is_tiny) {
3448            float_raise(float_flag_underflow, status);
3449        }
3450    }
3451
3452    zSig += increment;
3453    if (rounding_bumps_exp) {
3454        zSig >>= 1;
3455        zExp++;
3456    }
3457
3458    if (zExp < -10) {
3459        return packFloat16(zSign, 0, 0);
3460    }
3461    if (zExp < 0) {
3462        zSig >>= -zExp;
3463        zExp = 0;
3464    }
3465    return packFloat16(zSign, zExp, zSig >> 13);
3466}
3467
3468static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3469                                      uint32_t *zSigPtr)
3470{
3471    int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3472    *zSigPtr = aSig << shiftCount;
3473    *zExpPtr = 1 - shiftCount;
3474}
3475
3476/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3477   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3478
3479float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3480{
3481    flag aSign;
3482    int aExp;
3483    uint32_t aSig;
3484
3485    aSign = extractFloat16Sign(a);
3486    aExp = extractFloat16Exp(a);
3487    aSig = extractFloat16Frac(a);
3488
3489    if (aExp == 0x1f && ieee) {
3490        if (aSig) {
3491            return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3492        }
3493        return packFloat32(aSign, 0xff, 0);
3494    }
3495    if (aExp == 0) {
3496        if (aSig == 0) {
3497            return packFloat32(aSign, 0, 0);
3498        }
3499
3500        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3501        aExp--;
3502    }
3503    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3504}
3505
3506float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3507{
3508    flag aSign;
3509    int aExp;
3510    uint32_t aSig;
3511
3512    a = float32_squash_input_denormal(a, status);
3513
3514    aSig = extractFloat32Frac( a );
3515    aExp = extractFloat32Exp( a );
3516    aSign = extractFloat32Sign( a );
3517    if ( aExp == 0xFF ) {
3518        if (aSig) {
3519            /* Input is a NaN */
3520            if (!ieee) {
3521                float_raise(float_flag_invalid, status);
3522                return packFloat16(aSign, 0, 0);
3523            }
3524            return commonNaNToFloat16(
3525                float32ToCommonNaN(a, status), status);
3526        }
3527        /* Infinity */
3528        if (!ieee) {
3529            float_raise(float_flag_invalid, status);
3530            return packFloat16(aSign, 0x1f, 0x3ff);
3531        }
3532        return packFloat16(aSign, 0x1f, 0);
3533    }
3534    if (aExp == 0 && aSig == 0) {
3535        return packFloat16(aSign, 0, 0);
3536    }
3537    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3538     * even if the input is denormal; however this is harmless because
3539     * the largest possible single-precision denormal is still smaller
3540     * than the smallest representable half-precision denormal, and so we
3541     * will end up ignoring aSig and returning via the "always return zero"
3542     * codepath.
3543     */
3544    aSig |= 0x00800000;
3545    aExp -= 0x71;
3546
3547    return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3548}
3549
3550float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3551{
3552    flag aSign;
3553    int aExp;
3554    uint32_t aSig;
3555
3556    aSign = extractFloat16Sign(a);
3557    aExp = extractFloat16Exp(a);
3558    aSig = extractFloat16Frac(a);
3559
3560    if (aExp == 0x1f && ieee) {
3561        if (aSig) {
3562            return commonNaNToFloat64(
3563                float16ToCommonNaN(a, status), status);
3564        }
3565        return packFloat64(aSign, 0x7ff, 0);
3566    }
3567    if (aExp == 0) {
3568        if (aSig == 0) {
3569            return packFloat64(aSign, 0, 0);
3570        }
3571
3572        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3573        aExp--;
3574    }
3575    return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3576}
3577
3578float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3579{
3580    flag aSign;
3581    int aExp;
3582    uint64_t aSig;
3583    uint32_t zSig;
3584
3585    a = float64_squash_input_denormal(a, status);
3586
3587    aSig = extractFloat64Frac(a);
3588    aExp = extractFloat64Exp(a);
3589    aSign = extractFloat64Sign(a);
3590    if (aExp == 0x7FF) {
3591        if (aSig) {
3592            /* Input is a NaN */
3593            if (!ieee) {
3594                float_raise(float_flag_invalid, status);
3595                return packFloat16(aSign, 0, 0);
3596            }
3597            return commonNaNToFloat16(
3598                float64ToCommonNaN(a, status), status);
3599        }
3600        /* Infinity */
3601        if (!ieee) {
3602            float_raise(float_flag_invalid, status);
3603            return packFloat16(aSign, 0x1f, 0x3ff);
3604        }
3605        return packFloat16(aSign, 0x1f, 0);
3606    }
3607    shift64RightJamming(aSig, 29, &aSig);
3608    zSig = aSig;
3609    if (aExp == 0 && zSig == 0) {
3610        return packFloat16(aSign, 0, 0);
3611    }
3612    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3613     * even if the input is denormal; however this is harmless because
3614     * the largest possible single-precision denormal is still smaller
3615     * than the smallest representable half-precision denormal, and so we
3616     * will end up ignoring aSig and returning via the "always return zero"
3617     * codepath.
3618     */
3619    zSig |= 0x00800000;
3620    aExp -= 0x3F1;
3621
3622    return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3623}
3624
3625/*----------------------------------------------------------------------------
3626| Returns the result of converting the double-precision floating-point value
3627| `a' to the extended double-precision floating-point format.  The conversion
3628| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3629| Arithmetic.
3630*----------------------------------------------------------------------------*/
3631
3632floatx80 float64_to_floatx80(float64 a, float_status *status)
3633{
3634    flag aSign;
3635    int aExp;
3636    uint64_t aSig;
3637
3638    a = float64_squash_input_denormal(a, status);
3639    aSig = extractFloat64Frac( a );
3640    aExp = extractFloat64Exp( a );
3641    aSign = extractFloat64Sign( a );
3642    if ( aExp == 0x7FF ) {
3643        if (aSig) {
3644            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3645        }
3646        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3647    }
3648    if ( aExp == 0 ) {
3649        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3650        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3651    }
3652    return
3653        packFloatx80(
3654            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3655
3656}
3657
3658/*----------------------------------------------------------------------------
3659| Returns the result of converting the double-precision floating-point value
3660| `a' to the quadruple-precision floating-point format.  The conversion is
3661| performed according to the IEC/IEEE Standard for Binary Floating-Point
3662| Arithmetic.
3663*----------------------------------------------------------------------------*/
3664
3665float128 float64_to_float128(float64 a, float_status *status)
3666{
3667    flag aSign;
3668    int aExp;
3669    uint64_t aSig, zSig0, zSig1;
3670
3671    a = float64_squash_input_denormal(a, status);
3672    aSig = extractFloat64Frac( a );
3673    aExp = extractFloat64Exp( a );
3674    aSign = extractFloat64Sign( a );
3675    if ( aExp == 0x7FF ) {
3676        if (aSig) {
3677            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3678        }
3679        return packFloat128( aSign, 0x7FFF, 0, 0 );
3680    }
3681    if ( aExp == 0 ) {
3682        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3683        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3684        --aExp;
3685    }
3686    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3687    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3688
3689}
3690
3691/*----------------------------------------------------------------------------
3692| Rounds the double-precision floating-point value `a' to an integer, and
3693| returns the result as a double-precision floating-point value.  The
3694| operation is performed according to the IEC/IEEE Standard for Binary
3695| Floating-Point Arithmetic.
3696*----------------------------------------------------------------------------*/
3697
3698float64 float64_round_to_int(float64 a, float_status *status)
3699{
3700    flag aSign;
3701    int aExp;
3702    uint64_t lastBitMask, roundBitsMask;
3703    uint64_t z;
3704    a = float64_squash_input_denormal(a, status);
3705
3706    aExp = extractFloat64Exp( a );
3707    if ( 0x433 <= aExp ) {
3708        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3709            return propagateFloat64NaN(a, a, status);
3710        }
3711        return a;
3712    }
3713    if ( aExp < 0x3FF ) {
3714        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3715        status->float_exception_flags |= float_flag_inexact;
3716        aSign = extractFloat64Sign( a );
3717        switch (status->float_rounding_mode) {
3718         case float_round_nearest_even:
3719            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3720                return packFloat64( aSign, 0x3FF, 0 );
3721            }
3722            break;
3723        case float_round_ties_away:
3724            if (aExp == 0x3FE) {
3725                return packFloat64(aSign, 0x3ff, 0);
3726            }
3727            break;
3728         case float_round_down:
3729            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3730         case float_round_up:
3731            return make_float64(
3732            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3733        }
3734        return packFloat64( aSign, 0, 0 );
3735    }
3736    lastBitMask = 1;
3737    lastBitMask <<= 0x433 - aExp;
3738    roundBitsMask = lastBitMask - 1;
3739    z = float64_val(a);
3740    switch (status->float_rounding_mode) {
3741    case float_round_nearest_even:
3742        z += lastBitMask >> 1;
3743        if ((z & roundBitsMask) == 0) {
3744            z &= ~lastBitMask;
3745        }
3746        break;
3747    case float_round_ties_away:
3748        z += lastBitMask >> 1;
3749        break;
3750    case float_round_to_zero:
3751        break;
3752    case float_round_up:
3753        if (!extractFloat64Sign(make_float64(z))) {
3754            z += roundBitsMask;
3755        }
3756        break;
3757    case float_round_down:
3758        if (extractFloat64Sign(make_float64(z))) {
3759            z += roundBitsMask;
3760        }
3761        break;
3762    default:
3763        abort();
3764    }
3765    z &= ~ roundBitsMask;
3766    if (z != float64_val(a)) {
3767        status->float_exception_flags |= float_flag_inexact;
3768    }
3769    return make_float64(z);
3770
3771}
3772
3773float64 float64_trunc_to_int(float64 a, float_status *status)
3774{
3775    int oldmode;
3776    float64 res;
3777    oldmode = status->float_rounding_mode;
3778    status->float_rounding_mode = float_round_to_zero;
3779    res = float64_round_to_int(a, status);
3780    status->float_rounding_mode = oldmode;
3781    return res;
3782}
3783
3784/*----------------------------------------------------------------------------
3785| Returns the result of adding the absolute values of the double-precision
3786| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3787| before being returned.  `zSign' is ignored if the result is a NaN.
3788| The addition is performed according to the IEC/IEEE Standard for Binary
3789| Floating-Point Arithmetic.
3790*----------------------------------------------------------------------------*/
3791
3792static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3793                              float_status *status)
3794{
3795    int aExp, bExp, zExp;
3796    uint64_t aSig, bSig, zSig;
3797    int expDiff;
3798
3799    aSig = extractFloat64Frac( a );
3800    aExp = extractFloat64Exp( a );
3801    bSig = extractFloat64Frac( b );
3802    bExp = extractFloat64Exp( b );
3803    expDiff = aExp - bExp;
3804    aSig <<= 9;
3805    bSig <<= 9;
3806    if ( 0 < expDiff ) {
3807        if ( aExp == 0x7FF ) {
3808            if (aSig) {
3809                return propagateFloat64NaN(a, b, status);
3810            }
3811            return a;
3812        }
3813        if ( bExp == 0 ) {
3814            --expDiff;
3815        }
3816        else {
3817            bSig |= LIT64( 0x2000000000000000 );
3818        }
3819        shift64RightJamming( bSig, expDiff, &bSig );
3820        zExp = aExp;
3821    }
3822    else if ( expDiff < 0 ) {
3823        if ( bExp == 0x7FF ) {
3824            if (bSig) {
3825                return propagateFloat64NaN(a, b, status);
3826            }
3827            return packFloat64( zSign, 0x7FF, 0 );
3828        }
3829        if ( aExp == 0 ) {
3830            ++expDiff;
3831        }
3832        else {
3833            aSig |= LIT64( 0x2000000000000000 );
3834        }
3835        shift64RightJamming( aSig, - expDiff, &aSig );
3836        zExp = bExp;
3837    }
3838    else {
3839        if ( aExp == 0x7FF ) {
3840            if (aSig | bSig) {
3841                return propagateFloat64NaN(a, b, status);
3842            }
3843            return a;
3844        }
3845        if ( aExp == 0 ) {
3846            if (status->flush_to_zero) {
3847                if (aSig | bSig) {
3848                    float_raise(float_flag_output_denormal, status);
3849                }
3850                return packFloat64(zSign, 0, 0);
3851            }
3852            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3853        }
3854        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3855        zExp = aExp;
3856        goto roundAndPack;
3857    }
3858    aSig |= LIT64( 0x2000000000000000 );
3859    zSig = ( aSig + bSig )<<1;
3860    --zExp;
3861    if ( (int64_t) zSig < 0 ) {
3862        zSig = aSig + bSig;
3863        ++zExp;
3864    }
3865 roundAndPack:
3866    return roundAndPackFloat64(zSign, zExp, zSig, status);
3867
3868}
3869
3870/*----------------------------------------------------------------------------
3871| Returns the result of subtracting the absolute values of the double-
3872| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3873| difference is negated before being returned.  `zSign' is ignored if the
3874| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3875| Standard for Binary Floating-Point Arithmetic.
3876*----------------------------------------------------------------------------*/
3877
3878static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3879                              float_status *status)
3880{
3881    int aExp, bExp, zExp;
3882    uint64_t aSig, bSig, zSig;
3883    int expDiff;
3884
3885    aSig = extractFloat64Frac( a );
3886    aExp = extractFloat64Exp( a );
3887    bSig = extractFloat64Frac( b );
3888    bExp = extractFloat64Exp( b );
3889    expDiff = aExp - bExp;
3890    aSig <<= 10;
3891    bSig <<= 10;
3892    if ( 0 < expDiff ) goto aExpBigger;
3893    if ( expDiff < 0 ) goto bExpBigger;
3894    if ( aExp == 0x7FF ) {
3895        if (aSig | bSig) {
3896            return propagateFloat64NaN(a, b, status);
3897        }
3898        float_raise(float_flag_invalid, status);
3899        return float64_default_nan;
3900    }
3901    if ( aExp == 0 ) {
3902        aExp = 1;
3903        bExp = 1;
3904    }
3905    if ( bSig < aSig ) goto aBigger;
3906    if ( aSig < bSig ) goto bBigger;
3907    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3908 bExpBigger:
3909    if ( bExp == 0x7FF ) {
3910        if (bSig) {
3911            return propagateFloat64NaN(a, b, status);
3912        }
3913        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3914    }
3915    if ( aExp == 0 ) {
3916        ++expDiff;
3917    }
3918    else {
3919        aSig |= LIT64( 0x4000000000000000 );
3920    }
3921    shift64RightJamming( aSig, - expDiff, &aSig );
3922    bSig |= LIT64( 0x4000000000000000 );
3923 bBigger:
3924    zSig = bSig - aSig;
3925    zExp = bExp;
3926    zSign ^= 1;
3927    goto normalizeRoundAndPack;
3928 aExpBigger:
3929    if ( aExp == 0x7FF ) {
3930        if (aSig) {
3931            return propagateFloat64NaN(a, b, status);
3932        }
3933        return a;
3934    }
3935    if ( bExp == 0 ) {
3936        --expDiff;
3937    }
3938    else {
3939        bSig |= LIT64( 0x4000000000000000 );
3940    }
3941    shift64RightJamming( bSig, expDiff, &bSig );
3942    aSig |= LIT64( 0x4000000000000000 );
3943 aBigger:
3944    zSig = aSig - bSig;
3945    zExp = aExp;
3946 normalizeRoundAndPack:
3947    --zExp;
3948    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3949
3950}
3951
3952/*----------------------------------------------------------------------------
3953| Returns the result of adding the double-precision floating-point values `a'
3954| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3955| Binary Floating-Point Arithmetic.
3956*----------------------------------------------------------------------------*/
3957
3958float64 float64_add(float64 a, float64 b, float_status *status)
3959{
3960    flag aSign, bSign;
3961    a = float64_squash_input_denormal(a, status);
3962    b = float64_squash_input_denormal(b, status);
3963
3964    aSign = extractFloat64Sign( a );
3965    bSign = extractFloat64Sign( b );
3966    if ( aSign == bSign ) {
3967        return addFloat64Sigs(a, b, aSign, status);
3968    }
3969    else {
3970        return subFloat64Sigs(a, b, aSign, status);
3971    }
3972
3973}
3974
3975/*----------------------------------------------------------------------------
3976| Returns the result of subtracting the double-precision floating-point values
3977| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3978| for Binary Floating-Point Arithmetic.
3979*----------------------------------------------------------------------------*/
3980
3981float64 float64_sub(float64 a, float64 b, float_status *status)
3982{
3983    flag aSign, bSign;
3984    a = float64_squash_input_denormal(a, status);
3985    b = float64_squash_input_denormal(b, status);
3986
3987    aSign = extractFloat64Sign( a );
3988    bSign = extractFloat64Sign( b );
3989    if ( aSign == bSign ) {
3990        return subFloat64Sigs(a, b, aSign, status);
3991    }
3992    else {
3993        return addFloat64Sigs(a, b, aSign, status);
3994    }
3995
3996}
3997
3998/*----------------------------------------------------------------------------
3999| Returns the result of multiplying the double-precision floating-point values
4000| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4001| for Binary Floating-Point Arithmetic.
4002*----------------------------------------------------------------------------*/
4003
4004float64 float64_mul(float64 a, float64 b, float_status *status)
4005{
4006    flag aSign, bSign, zSign;
4007    int aExp, bExp, zExp;
4008    uint64_t aSig, bSig, zSig0, zSig1;
4009
4010    a = float64_squash_input_denormal(a, status);
4011    b = float64_squash_input_denormal(b, status);
4012
4013    aSig = extractFloat64Frac( a );
4014    aExp = extractFloat64Exp( a );
4015    aSign = extractFloat64Sign( a );
4016    bSig = extractFloat64Frac( b );
4017    bExp = extractFloat64Exp( b );
4018    bSign = extractFloat64Sign( b );
4019    zSign = aSign ^ bSign;
4020    if ( aExp == 0x7FF ) {
4021        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4022            return propagateFloat64NaN(a, b, status);
4023        }
4024        if ( ( bExp | bSig ) == 0 ) {
4025            float_raise(float_flag_invalid, status);
4026            return float64_default_nan;
4027        }
4028        return packFloat64( zSign, 0x7FF, 0 );
4029    }
4030    if ( bExp == 0x7FF ) {
4031        if (bSig) {
4032            return propagateFloat64NaN(a, b, status);
4033        }
4034        if ( ( aExp | aSig ) == 0 ) {
4035            float_raise(float_flag_invalid, status);
4036            return float64_default_nan;
4037        }
4038        return packFloat64( zSign, 0x7FF, 0 );
4039    }
4040    if ( aExp == 0 ) {
4041        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4042        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4043    }
4044    if ( bExp == 0 ) {
4045        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4046        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4047    }
4048    zExp = aExp + bExp - 0x3FF;
4049    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4050    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4051    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4052    zSig0 |= ( zSig1 != 0 );
4053    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4054        zSig0 <<= 1;
4055        --zExp;
4056    }
4057    return roundAndPackFloat64(zSign, zExp, zSig0, status);
4058
4059}
4060
4061/*----------------------------------------------------------------------------
4062| Returns the result of dividing the double-precision floating-point value `a'
4063| by the corresponding value `b'.  The operation is performed according to
4064| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4065*----------------------------------------------------------------------------*/
4066
4067float64 float64_div(float64 a, float64 b, float_status *status)
4068{
4069    flag aSign, bSign, zSign;
4070    int aExp, bExp, zExp;
4071    uint64_t aSig, bSig, zSig;
4072    uint64_t rem0, rem1;
4073    uint64_t term0, term1;
4074    a = float64_squash_input_denormal(a, status);
4075    b = float64_squash_input_denormal(b, status);
4076
4077    aSig = extractFloat64Frac( a );
4078    aExp = extractFloat64Exp( a );
4079    aSign = extractFloat64Sign( a );
4080    bSig = extractFloat64Frac( b );
4081    bExp = extractFloat64Exp( b );
4082    bSign = extractFloat64Sign( b );
4083    zSign = aSign ^ bSign;
4084    if ( aExp == 0x7FF ) {
4085        if (aSig) {
4086            return propagateFloat64NaN(a, b, status);
4087        }
4088        if ( bExp == 0x7FF ) {
4089            if (bSig) {
4090                return propagateFloat64NaN(a, b, status);
4091            }
4092            float_raise(float_flag_invalid, status);
4093            return float64_default_nan;
4094        }
4095        return packFloat64( zSign, 0x7FF, 0 );
4096    }
4097    if ( bExp == 0x7FF ) {
4098        if (bSig) {
4099            return propagateFloat64NaN(a, b, status);
4100        }
4101        return packFloat64( zSign, 0, 0 );
4102    }
4103    if ( bExp == 0 ) {
4104        if ( bSig == 0 ) {
4105            if ( ( aExp | aSig ) == 0 ) {
4106                float_raise(float_flag_invalid, status);
4107                return float64_default_nan;
4108            }
4109            float_raise(float_flag_divbyzero, status);
4110            return packFloat64( zSign, 0x7FF, 0 );
4111        }
4112        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4113    }
4114    if ( aExp == 0 ) {
4115        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4116        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4117    }
4118    zExp = aExp - bExp + 0x3FD;
4119    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4120    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4121    if ( bSig <= ( aSig + aSig ) ) {
4122        aSig >>= 1;
4123        ++zExp;
4124    }
4125    zSig = estimateDiv128To64( aSig, 0, bSig );
4126    if ( ( zSig & 0x1FF ) <= 2 ) {
4127        mul64To128( bSig, zSig, &term0, &term1 );
4128        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4129        while ( (int64_t) rem0 < 0 ) {
4130            --zSig;
4131            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4132        }
4133        zSig |= ( rem1 != 0 );
4134    }
4135    return roundAndPackFloat64(zSign, zExp, zSig, status);
4136
4137}
4138
4139/*----------------------------------------------------------------------------
4140| Returns the remainder of the double-precision floating-point value `a'
4141| with respect to the corresponding value `b'.  The operation is performed
4142| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4143*----------------------------------------------------------------------------*/
4144
4145float64 float64_rem(float64 a, float64 b, float_status *status)
4146{
4147    flag aSign, zSign;
4148    int aExp, bExp, expDiff;
4149    uint64_t aSig, bSig;
4150    uint64_t q, alternateASig;
4151    int64_t sigMean;
4152
4153    a = float64_squash_input_denormal(a, status);
4154    b = float64_squash_input_denormal(b, status);
4155    aSig = extractFloat64Frac( a );
4156    aExp = extractFloat64Exp( a );
4157    aSign = extractFloat64Sign( a );
4158    bSig = extractFloat64Frac( b );
4159    bExp = extractFloat64Exp( b );
4160    if ( aExp == 0x7FF ) {
4161        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4162            return propagateFloat64NaN(a, b, status);
4163        }
4164        float_raise(float_flag_invalid, status);
4165        return float64_default_nan;
4166    }
4167    if ( bExp == 0x7FF ) {
4168        if (bSig) {
4169            return propagateFloat64NaN(a, b, status);
4170        }
4171        return a;
4172    }
4173    if ( bExp == 0 ) {
4174        if ( bSig == 0 ) {
4175            float_raise(float_flag_invalid, status);
4176            return float64_default_nan;
4177        }
4178        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4179    }
4180    if ( aExp == 0 ) {
4181        if ( aSig == 0 ) return a;
4182        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4183    }
4184    expDiff = aExp - bExp;
4185    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4186    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4187    if ( expDiff < 0 ) {
4188        if ( expDiff < -1 ) return a;
4189        aSig >>= 1;
4190    }
4191    q = ( bSig <= aSig );
4192    if ( q ) aSig -= bSig;
4193    expDiff -= 64;
4194    while ( 0 < expDiff ) {
4195        q = estimateDiv128To64( aSig, 0, bSig );
4196        q = ( 2 < q ) ? q - 2 : 0;
4197        aSig = - ( ( bSig>>2 ) * q );
4198        expDiff -= 62;
4199    }
4200    expDiff += 64;
4201    if ( 0 < expDiff ) {
4202        q = estimateDiv128To64( aSig, 0, bSig );
4203        q = ( 2 < q ) ? q - 2 : 0;
4204        q >>= 64 - expDiff;
4205        bSig >>= 2;
4206        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4207    }
4208    else {
4209        aSig >>= 2;
4210        bSig >>= 2;
4211    }
4212    do {
4213        alternateASig = aSig;
4214        ++q;
4215        aSig -= bSig;
4216    } while ( 0 <= (int64_t) aSig );
4217    sigMean = aSig + alternateASig;
4218    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4219        aSig = alternateASig;
4220    }
4221    zSign = ( (int64_t) aSig < 0 );
4222    if ( zSign ) aSig = - aSig;
4223    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4224
4225}
4226
4227/*----------------------------------------------------------------------------
4228| Returns the result of multiplying the double-precision floating-point values
4229| `a' and `b' then adding 'c', with no intermediate rounding step after the
4230| multiplication.  The operation is performed according to the IEC/IEEE
4231| Standard for Binary Floating-Point Arithmetic 754-2008.
4232| The flags argument allows the caller to select negation of the
4233| addend, the intermediate product, or the final result. (The difference
4234| between this and having the caller do a separate negation is that negating
4235| externally will flip the sign bit on NaNs.)
4236*----------------------------------------------------------------------------*/
4237
4238float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4239                       float_status *status)
4240{
4241    flag aSign, bSign, cSign, zSign;
4242    int aExp, bExp, cExp, pExp, zExp, expDiff;
4243    uint64_t aSig, bSig, cSig;
4244    flag pInf, pZero, pSign;
4245    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4246    int shiftcount;
4247    flag signflip, infzero;
4248
4249    a = float64_squash_input_denormal(a, status);
4250    b = float64_squash_input_denormal(b, status);
4251    c = float64_squash_input_denormal(c, status);
4252    aSig = extractFloat64Frac(a);
4253    aExp = extractFloat64Exp(a);
4254    aSign = extractFloat64Sign(a);
4255    bSig = extractFloat64Frac(b);
4256    bExp = extractFloat64Exp(b);
4257    bSign = extractFloat64Sign(b);
4258    cSig = extractFloat64Frac(c);
4259    cExp = extractFloat64Exp(c);
4260    cSign = extractFloat64Sign(c);
4261
4262    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4263               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4264
4265    /* It is implementation-defined whether the cases of (0,inf,qnan)
4266     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4267     * they return if they do), so we have to hand this information
4268     * off to the target-specific pick-a-NaN routine.
4269     */
4270    if (((aExp == 0x7ff) && aSig) ||
4271        ((bExp == 0x7ff) && bSig) ||
4272        ((cExp == 0x7ff) && cSig)) {
4273        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4274    }
4275
4276    if (infzero) {
4277        float_raise(float_flag_invalid, status);
4278        return float64_default_nan;
4279    }
4280
4281    if (flags & float_muladd_negate_c) {
4282        cSign ^= 1;
4283    }
4284
4285    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4286
4287    /* Work out the sign and type of the product */
4288    pSign = aSign ^ bSign;
4289    if (flags & float_muladd_negate_product) {
4290        pSign ^= 1;
4291    }
4292    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4293    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4294
4295    if (cExp == 0x7ff) {
4296        if (pInf && (pSign ^ cSign)) {
4297            /* addition of opposite-signed infinities => InvalidOperation */
4298            float_raise(float_flag_invalid, status);
4299            return float64_default_nan;
4300        }
4301        /* Otherwise generate an infinity of the same sign */
4302        return packFloat64(cSign ^ signflip, 0x7ff, 0);
4303    }
4304
4305    if (pInf) {
4306        return packFloat64(pSign ^ signflip, 0x7ff, 0);
4307    }
4308
4309    if (pZero) {
4310        if (cExp == 0) {
4311            if (cSig == 0) {
4312                /* Adding two exact zeroes */
4313                if (pSign == cSign) {
4314                    zSign = pSign;
4315                } else if (status->float_rounding_mode == float_round_down) {
4316                    zSign = 1;
4317                } else {
4318                    zSign = 0;
4319                }
4320                return packFloat64(zSign ^ signflip, 0, 0);
4321            }
4322            /* Exact zero plus a denorm */
4323            if (status->flush_to_zero) {
4324                float_raise(float_flag_output_denormal, status);
4325                return packFloat64(cSign ^ signflip, 0, 0);
4326            }
4327        }
4328        /* Zero plus something non-zero : just return the something */
4329        if (flags & float_muladd_halve_result) {
4330            if (cExp == 0) {
4331                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4332            }
4333            /* Subtract one to halve, and one again because roundAndPackFloat64
4334             * wants one less than the true exponent.
4335             */
4336            cExp -= 2;
4337            cSig = (cSig | 0x0010000000000000ULL) << 10;
4338            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4339        }
4340        return packFloat64(cSign ^ signflip, cExp, cSig);
4341    }
4342
4343    if (aExp == 0) {
4344        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4345    }
4346    if (bExp == 0) {
4347        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4348    }
4349
4350    /* Calculate the actual result a * b + c */
4351
4352    /* Multiply first; this is easy. */
4353    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4354     * because we want the true exponent, not the "one-less-than"
4355     * flavour that roundAndPackFloat64() takes.
4356     */
4357    pExp = aExp + bExp - 0x3fe;
4358    aSig = (aSig | LIT64(0x0010000000000000))<<10;
4359    bSig = (bSig | LIT64(0x0010000000000000))<<11;
4360    mul64To128(aSig, bSig, &pSig0, &pSig1);
4361    if ((int64_t)(pSig0 << 1) >= 0) {
4362        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4363        pExp--;
4364    }
4365
4366    zSign = pSign ^ signflip;
4367
4368    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4369     * bit in position 126.
4370     */
4371    if (cExp == 0) {
4372        if (!cSig) {
4373            /* Throw out the special case of c being an exact zero now */
4374            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4375            if (flags & float_muladd_halve_result) {
4376                pExp--;
4377            }
4378            return roundAndPackFloat64(zSign, pExp - 1,
4379                                       pSig1, status);
4380        }
4381        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4382    }
4383
4384    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4385     * significand of the addend, with the explicit bit in position 126.
4386     */
4387    cSig0 = cSig << (126 - 64 - 52);
4388    cSig1 = 0;
4389    cSig0 |= LIT64(0x4000000000000000);
4390    expDiff = pExp - cExp;
4391
4392    if (pSign == cSign) {
4393        /* Addition */
4394        if (expDiff > 0) {
4395            /* scale c to match p */
4396            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4397            zExp = pExp;
4398        } else if (expDiff < 0) {
4399            /* scale p to match c */
4400            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4401            zExp = cExp;
4402        } else {
4403            /* no scaling needed */
4404            zExp = cExp;
4405        }
4406        /* Add significands and make sure explicit bit ends up in posn 126 */
4407        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4408        if ((int64_t)zSig0 < 0) {
4409            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4410        } else {
4411            zExp--;
4412        }
4413        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4414        if (flags & float_muladd_halve_result) {
4415            zExp--;
4416        }
4417        return roundAndPackFloat64(zSign, zExp, zSig1, status);
4418    } else {
4419        /* Subtraction */
4420        if (expDiff > 0) {
4421            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4422            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4423            zExp = pExp;
4424        } else if (expDiff < 0) {
4425            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4426            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4427            zExp = cExp;
4428            zSign ^= 1;
4429        } else {
4430            zExp = pExp;
4431            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4432                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4433            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4434                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4435                zSign ^= 1;
4436            } else {
4437                /* Exact zero */
4438                zSign = signflip;
4439                if (status->float_rounding_mode == float_round_down) {
4440                    zSign ^= 1;
4441                }
4442                return packFloat64(zSign, 0, 0);
4443            }
4444        }
4445        --zExp;
4446        /* Do the equivalent of normalizeRoundAndPackFloat64() but
4447         * starting with the significand in a pair of uint64_t.
4448         */
4449        if (zSig0) {
4450            shiftcount = countLeadingZeros64(zSig0) - 1;
4451            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4452            if (zSig1) {
4453                zSig0 |= 1;
4454            }
4455            zExp -= shiftcount;
4456        } else {
4457            shiftcount = countLeadingZeros64(zSig1);
4458            if (shiftcount == 0) {
4459                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4460                zExp -= 63;
4461            } else {
4462                shiftcount--;
4463                zSig0 = zSig1 << shiftcount;
4464                zExp -= (shiftcount + 64);
4465            }
4466        }
4467        if (flags & float_muladd_halve_result) {
4468            zExp--;
4469        }
4470        return roundAndPackFloat64(zSign, zExp, zSig0, status);
4471    }
4472}
4473
4474/*----------------------------------------------------------------------------
4475| Returns the square root of the double-precision floating-point value `a'.
4476| The operation is performed according to the IEC/IEEE Standard for Binary
4477| Floating-Point Arithmetic.
4478*----------------------------------------------------------------------------*/
4479
4480float64 float64_sqrt(float64 a, float_status *status)
4481{
4482    flag aSign;
4483    int aExp, zExp;
4484    uint64_t aSig, zSig, doubleZSig;
4485    uint64_t rem0, rem1, term0, term1;
4486    a = float64_squash_input_denormal(a, status);
4487
4488    aSig = extractFloat64Frac( a );
4489    aExp = extractFloat64Exp( a );
4490    aSign = extractFloat64Sign( a );
4491    if ( aExp == 0x7FF ) {
4492        if (aSig) {
4493            return propagateFloat64NaN(a, a, status);
4494        }
4495        if ( ! aSign ) return a;
4496        float_raise(float_flag_invalid, status);
4497        return float64_default_nan;
4498    }
4499    if ( aSign ) {
4500        if ( ( aExp | aSig ) == 0 ) return a;
4501        float_raise(float_flag_invalid, status);
4502        return float64_default_nan;
4503    }
4504    if ( aExp == 0 ) {
4505        if ( aSig == 0 ) return float64_zero;
4506        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4507    }
4508    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4509    aSig |= LIT64( 0x0010000000000000 );
4510    zSig = estimateSqrt32( aExp, aSig>>21 );
4511    aSig <<= 9 - ( aExp & 1 );
4512    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4513    if ( ( zSig & 0x1FF ) <= 5 ) {
4514        doubleZSig = zSig<<1;
4515        mul64To128( zSig, zSig, &term0, &term1 );
4516        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4517        while ( (int64_t) rem0 < 0 ) {
4518            --zSig;
4519            doubleZSig -= 2;
4520            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4521        }
4522        zSig |= ( ( rem0 | rem1 ) != 0 );
4523    }
4524    return roundAndPackFloat64(0, zExp, zSig, status);
4525
4526}
4527
4528/*----------------------------------------------------------------------------
4529| Returns the binary log of the double-precision floating-point value `a'.
4530| The operation is performed according to the IEC/IEEE Standard for Binary
4531| Floating-Point Arithmetic.
4532*----------------------------------------------------------------------------*/
4533float64 float64_log2(float64 a, float_status *status)
4534{
4535    flag aSign, zSign;
4536    int aExp;
4537    uint64_t aSig, aSig0, aSig1, zSig, i;
4538    a = float64_squash_input_denormal(a, status);
4539
4540    aSig = extractFloat64Frac( a );
4541    aExp = extractFloat64Exp( a );
4542    aSign = extractFloat64Sign( a );
4543
4544    if ( aExp == 0 ) {
4545        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4546        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4547    }
4548    if ( aSign ) {
4549        float_raise(float_flag_invalid, status);
4550        return float64_default_nan;
4551    }
4552    if ( aExp == 0x7FF ) {
4553        if (aSig) {
4554            return propagateFloat64NaN(a, float64_zero, status);
4555        }
4556        return a;
4557    }
4558
4559    aExp -= 0x3FF;
4560    aSig |= LIT64( 0x0010000000000000 );
4561    zSign = aExp < 0;
4562    zSig = (uint64_t)aExp << 52;
4563    for (i = 1LL << 51; i > 0; i >>= 1) {
4564        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4565        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4566        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4567            aSig >>= 1;
4568            zSig |= i;
4569        }
4570    }
4571
4572    if ( zSign )
4573        zSig = -zSig;
4574    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4575}
4576
4577/*----------------------------------------------------------------------------
4578| Returns 1 if the double-precision floating-point value `a' is equal to the
4579| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4580| if either operand is a NaN.  Otherwise, the comparison is performed
4581| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4582*----------------------------------------------------------------------------*/
4583
4584int float64_eq(float64 a, float64 b, float_status *status)
4585{
4586    uint64_t av, bv;
4587    a = float64_squash_input_denormal(a, status);
4588    b = float64_squash_input_denormal(b, status);
4589
4590    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4591         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4592       ) {
4593        float_raise(float_flag_invalid, status);
4594        return 0;
4595    }
4596    av = float64_val(a);
4597    bv = float64_val(b);
4598    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4599
4600}
4601
4602/*----------------------------------------------------------------------------
4603| Returns 1 if the double-precision floating-point value `a' is less than or
4604| equal to the corresponding value `b', and 0 otherwise.  The invalid
4605| exception is raised if either operand is a NaN.  The comparison is performed
4606| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4607*----------------------------------------------------------------------------*/
4608
4609int float64_le(float64 a, float64 b, float_status *status)
4610{
4611    flag aSign, bSign;
4612    uint64_t av, bv;
4613    a = float64_squash_input_denormal(a, status);
4614    b = float64_squash_input_denormal(b, status);
4615
4616    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4617         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4618       ) {
4619        float_raise(float_flag_invalid, status);
4620        return 0;
4621    }
4622    aSign = extractFloat64Sign( a );
4623    bSign = extractFloat64Sign( b );
4624    av = float64_val(a);
4625    bv = float64_val(b);
4626    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4627    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4628
4629}
4630
4631/*----------------------------------------------------------------------------
4632| Returns 1 if the double-precision floating-point value `a' is less than
4633| the corresponding value `b', and 0 otherwise.  The invalid exception is
4634| raised if either operand is a NaN.  The comparison is performed according
4635| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4636*----------------------------------------------------------------------------*/
4637
4638int float64_lt(float64 a, float64 b, float_status *status)
4639{
4640    flag aSign, bSign;
4641    uint64_t av, bv;
4642
4643    a = float64_squash_input_denormal(a, status);
4644    b = float64_squash_input_denormal(b, status);
4645    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4646         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4647       ) {
4648        float_raise(float_flag_invalid, status);
4649        return 0;
4650    }
4651    aSign = extractFloat64Sign( a );
4652    bSign = extractFloat64Sign( b );
4653    av = float64_val(a);
4654    bv = float64_val(b);
4655    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4656    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4657
4658}
4659
4660/*----------------------------------------------------------------------------
4661| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4662| be compared, and 0 otherwise.  The invalid exception is raised if either
4663| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4664| Standard for Binary Floating-Point Arithmetic.
4665*----------------------------------------------------------------------------*/
4666
4667int float64_unordered(float64 a, float64 b, float_status *status)
4668{
4669    a = float64_squash_input_denormal(a, status);
4670    b = float64_squash_input_denormal(b, status);
4671
4672    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4673         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4674       ) {
4675        float_raise(float_flag_invalid, status);
4676        return 1;
4677    }
4678    return 0;
4679}
4680
4681/*----------------------------------------------------------------------------
4682| Returns 1 if the double-precision floating-point value `a' is equal to the
4683| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4684| exception.The comparison is performed according to the IEC/IEEE Standard
4685| for Binary Floating-Point Arithmetic.
4686*----------------------------------------------------------------------------*/
4687
4688int float64_eq_quiet(float64 a, float64 b, float_status *status)
4689{
4690    uint64_t av, bv;
4691    a = float64_squash_input_denormal(a, status);
4692    b = float64_squash_input_denormal(b, status);
4693
4694    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4695         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4696       ) {
4697        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4698            float_raise(float_flag_invalid, status);
4699        }
4700        return 0;
4701    }
4702    av = float64_val(a);
4703    bv = float64_val(b);
4704    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4705
4706}
4707
4708/*----------------------------------------------------------------------------
4709| Returns 1 if the double-precision floating-point value `a' is less than or
4710| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4711| cause an exception.  Otherwise, the comparison is performed according to the
4712| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4713*----------------------------------------------------------------------------*/
4714
4715int float64_le_quiet(float64 a, float64 b, float_status *status)
4716{
4717    flag aSign, bSign;
4718    uint64_t av, bv;
4719    a = float64_squash_input_denormal(a, status);
4720    b = float64_squash_input_denormal(b, status);
4721
4722    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4723         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4724       ) {
4725        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4726            float_raise(float_flag_invalid, status);
4727        }
4728        return 0;
4729    }
4730    aSign = extractFloat64Sign( a );
4731    bSign = extractFloat64Sign( b );
4732    av = float64_val(a);
4733    bv = float64_val(b);
4734    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4735    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4736
4737}
4738
4739/*----------------------------------------------------------------------------
4740| Returns 1 if the double-precision floating-point value `a' is less than
4741| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4742| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4743| Standard for Binary Floating-Point Arithmetic.
4744*----------------------------------------------------------------------------*/
4745
4746int float64_lt_quiet(float64 a, float64 b, float_status *status)
4747{
4748    flag aSign, bSign;
4749    uint64_t av, bv;
4750    a = float64_squash_input_denormal(a, status);
4751    b = float64_squash_input_denormal(b, status);
4752
4753    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4754         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4755       ) {
4756        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4757            float_raise(float_flag_invalid, status);
4758        }
4759        return 0;
4760    }
4761    aSign = extractFloat64Sign( a );
4762    bSign = extractFloat64Sign( b );
4763    av = float64_val(a);
4764    bv = float64_val(b);
4765    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4766    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4767
4768}
4769
4770/*----------------------------------------------------------------------------
4771| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4772| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4773| comparison is performed according to the IEC/IEEE Standard for Binary
4774| Floating-Point Arithmetic.
4775*----------------------------------------------------------------------------*/
4776
4777int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4778{
4779    a = float64_squash_input_denormal(a, status);
4780    b = float64_squash_input_denormal(b, status);
4781
4782    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4783         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4784       ) {
4785        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4786            float_raise(float_flag_invalid, status);
4787        }
4788        return 1;
4789    }
4790    return 0;
4791}
4792
4793/*----------------------------------------------------------------------------
4794| Returns the result of converting the extended double-precision floating-
4795| point value `a' to the 32-bit two's complement integer format.  The
4796| conversion is performed according to the IEC/IEEE Standard for Binary
4797| Floating-Point Arithmetic---which means in particular that the conversion
4798| is rounded according to the current rounding mode.  If `a' is a NaN, the
4799| largest positive integer is returned.  Otherwise, if the conversion
4800| overflows, the largest integer with the same sign as `a' is returned.
4801*----------------------------------------------------------------------------*/
4802
4803int32_t floatx80_to_int32(floatx80 a, float_status *status)
4804{
4805    flag aSign;
4806    int32_t aExp, shiftCount;
4807    uint64_t aSig;
4808
4809    aSig = extractFloatx80Frac( a );
4810    aExp = extractFloatx80Exp( a );
4811    aSign = extractFloatx80Sign( a );
4812    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4813    shiftCount = 0x4037 - aExp;
4814    if ( shiftCount <= 0 ) shiftCount = 1;
4815    shift64RightJamming( aSig, shiftCount, &aSig );
4816    return roundAndPackInt32(aSign, aSig, status);
4817
4818}
4819
4820/*----------------------------------------------------------------------------
4821| Returns the result of converting the extended double-precision floating-
4822| point value `a' to the 32-bit two's complement integer format.  The
4823| conversion is performed according to the IEC/IEEE Standard for Binary
4824| Floating-Point Arithmetic, except that the conversion is always rounded
4825| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4826| Otherwise, if the conversion overflows, the largest integer with the same
4827| sign as `a' is returned.
4828*----------------------------------------------------------------------------*/
4829
4830int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4831{
4832    flag aSign;
4833    int32_t aExp, shiftCount;
4834    uint64_t aSig, savedASig;
4835    int32_t z;
4836
4837    aSig = extractFloatx80Frac( a );
4838    aExp = extractFloatx80Exp( a );
4839    aSign = extractFloatx80Sign( a );
4840    if ( 0x401E < aExp ) {
4841        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4842        goto invalid;
4843    }
4844    else if ( aExp < 0x3FFF ) {
4845        if (aExp || aSig) {
4846            status->float_exception_flags |= float_flag_inexact;
4847        }
4848        return 0;
4849    }
4850    shiftCount = 0x403E - aExp;
4851    savedASig = aSig;
4852    aSig >>= shiftCount;
4853    z = aSig;
4854    if ( aSign ) z = - z;
4855    if ( ( z < 0 ) ^ aSign ) {
4856 invalid:
4857        float_raise(float_flag_invalid, status);
4858        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4859    }
4860    if ( ( aSig<<shiftCount ) != savedASig ) {
4861        status->float_exception_flags |= float_flag_inexact;
4862    }
4863    return z;
4864
4865}
4866
4867/*----------------------------------------------------------------------------
4868| Returns the result of converting the extended double-precision floating-
4869| point value `a' to the 64-bit two's complement integer format.  The
4870| conversion is performed according to the IEC/IEEE Standard for Binary
4871| Floating-Point Arithmetic---which means in particular that the conversion
4872| is rounded according to the current rounding mode.  If `a' is a NaN,
4873| the largest positive integer is returned.  Otherwise, if the conversion
4874| overflows, the largest integer with the same sign as `a' is returned.
4875*----------------------------------------------------------------------------*/
4876
4877int64_t floatx80_to_int64(floatx80 a, float_status *status)
4878{
4879    flag aSign;
4880    int32_t aExp, shiftCount;
4881    uint64_t aSig, aSigExtra;
4882
4883    aSig = extractFloatx80Frac( a );
4884    aExp = extractFloatx80Exp( a );
4885    aSign = extractFloatx80Sign( a );
4886    shiftCount = 0x403E - aExp;
4887    if ( shiftCount <= 0 ) {
4888        if ( shiftCount ) {
4889            float_raise(float_flag_invalid, status);
4890            if (    ! aSign
4891                 || (    ( aExp == 0x7FFF )
4892                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4893               ) {
4894                return LIT64( 0x7FFFFFFFFFFFFFFF );
4895            }
4896            return (int64_t) LIT64( 0x8000000000000000 );
4897        }
4898        aSigExtra = 0;
4899    }
4900    else {
4901        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4902    }
4903    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4904
4905}
4906
4907/*----------------------------------------------------------------------------
4908| Returns the result of converting the extended double-precision floating-
4909| point value `a' to the 64-bit two's complement integer format.  The
4910| conversion is performed according to the IEC/IEEE Standard for Binary
4911| Floating-Point Arithmetic, except that the conversion is always rounded
4912| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4913| Otherwise, if the conversion overflows, the largest integer with the same
4914| sign as `a' is returned.
4915*----------------------------------------------------------------------------*/
4916
4917int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4918{
4919    flag aSign;
4920    int32_t aExp, shiftCount;
4921    uint64_t aSig;
4922    int64_t z;
4923
4924    aSig = extractFloatx80Frac( a );
4925    aExp = extractFloatx80Exp( a );
4926    aSign = extractFloatx80Sign( a );
4927    shiftCount = aExp - 0x403E;
4928    if ( 0 <= shiftCount ) {
4929        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4930        if ( ( a.high != 0xC03E ) || aSig ) {
4931            float_raise(float_flag_invalid, status);
4932            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4933                return LIT64( 0x7FFFFFFFFFFFFFFF );
4934            }
4935        }
4936        return (int64_t) LIT64( 0x8000000000000000 );
4937    }
4938    else if ( aExp < 0x3FFF ) {
4939        if (aExp | aSig) {
4940            status->float_exception_flags |= float_flag_inexact;
4941        }
4942        return 0;
4943    }
4944    z = aSig>>( - shiftCount );
4945    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4946        status->float_exception_flags |= float_flag_inexact;
4947    }
4948    if ( aSign ) z = - z;
4949    return z;
4950
4951}
4952
4953/*----------------------------------------------------------------------------
4954| Returns the result of converting the extended double-precision floating-
4955| point value `a' to the single-precision floating-point format.  The
4956| conversion is performed according to the IEC/IEEE Standard for Binary
4957| Floating-Point Arithmetic.
4958*----------------------------------------------------------------------------*/
4959
4960float32 floatx80_to_float32(floatx80 a, float_status *status)
4961{
4962    flag aSign;
4963    int32_t aExp;
4964    uint64_t aSig;
4965
4966    aSig = extractFloatx80Frac( a );
4967    aExp = extractFloatx80Exp( a );
4968    aSign = extractFloatx80Sign( a );
4969    if ( aExp == 0x7FFF ) {
4970        if ( (uint64_t) ( aSig<<1 ) ) {
4971            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4972        }
4973        return packFloat32( aSign, 0xFF, 0 );
4974    }
4975    shift64RightJamming( aSig, 33, &aSig );
4976    if ( aExp || aSig ) aExp -= 0x3F81;
4977    return roundAndPackFloat32(aSign, aExp, aSig, status);
4978
4979}
4980
4981/*----------------------------------------------------------------------------
4982| Returns the result of converting the extended double-precision floating-
4983| point value `a' to the double-precision floating-point format.  The
4984| conversion is performed according to the IEC/IEEE Standard for Binary
4985| Floating-Point Arithmetic.
4986*----------------------------------------------------------------------------*/
4987
4988float64 floatx80_to_float64(floatx80 a, float_status *status)
4989{
4990    flag aSign;
4991    int32_t aExp;
4992    uint64_t aSig, zSig;
4993
4994    aSig = extractFloatx80Frac( a );
4995    aExp = extractFloatx80Exp( a );
4996    aSign = extractFloatx80Sign( a );
4997    if ( aExp == 0x7FFF ) {
4998        if ( (uint64_t) ( aSig<<1 ) ) {
4999            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5000        }
5001        return packFloat64( aSign, 0x7FF, 0 );
5002    }
5003    shift64RightJamming( aSig, 1, &zSig );
5004    if ( aExp || aSig ) aExp -= 0x3C01;
5005    return roundAndPackFloat64(aSign, aExp, zSig, status);
5006
5007}
5008
5009/*----------------------------------------------------------------------------
5010| Returns the result of converting the extended double-precision floating-
5011| point value `a' to the quadruple-precision floating-point format.  The
5012| conversion is performed according to the IEC/IEEE Standard for Binary
5013| Floating-Point Arithmetic.
5014*----------------------------------------------------------------------------*/
5015
5016float128 floatx80_to_float128(floatx80 a, float_status *status)
5017{
5018    flag aSign;
5019    int aExp;
5020    uint64_t aSig, zSig0, zSig1;
5021
5022    aSig = extractFloatx80Frac( a );
5023    aExp = extractFloatx80Exp( a );
5024    aSign = extractFloatx80Sign( a );
5025    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5026        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5027    }
5028    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5029    return packFloat128( aSign, aExp, zSig0, zSig1 );
5030
5031}
5032
5033/*----------------------------------------------------------------------------
5034| Rounds the extended double-precision floating-point value `a' to an integer,
5035| and returns the result as an extended quadruple-precision floating-point
5036| value.  The operation is performed according to the IEC/IEEE Standard for
5037| Binary Floating-Point Arithmetic.
5038*----------------------------------------------------------------------------*/
5039
5040floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5041{
5042    flag aSign;
5043    int32_t aExp;
5044    uint64_t lastBitMask, roundBitsMask;
5045    floatx80 z;
5046
5047    aExp = extractFloatx80Exp( a );
5048    if ( 0x403E <= aExp ) {
5049        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5050            return propagateFloatx80NaN(a, a, status);
5051        }
5052        return a;
5053    }
5054    if ( aExp < 0x3FFF ) {
5055        if (    ( aExp == 0 )
5056             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5057            return a;
5058        }
5059        status->float_exception_flags |= float_flag_inexact;
5060        aSign = extractFloatx80Sign( a );
5061        switch (status->float_rounding_mode) {
5062         case float_round_nearest_even:
5063            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5064               ) {
5065                return
5066                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5067            }
5068            break;
5069        case float_round_ties_away:
5070            if (aExp == 0x3FFE) {
5071                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5072            }
5073            break;
5074         case float_round_down:
5075            return
5076                  aSign ?
5077                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5078                : packFloatx80( 0, 0, 0 );
5079         case float_round_up:
5080            return
5081                  aSign ? packFloatx80( 1, 0, 0 )
5082                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5083        }
5084        return packFloatx80( aSign, 0, 0 );
5085    }
5086    lastBitMask = 1;
5087    lastBitMask <<= 0x403E - aExp;
5088    roundBitsMask = lastBitMask - 1;
5089    z = a;
5090    switch (status->float_rounding_mode) {
5091    case float_round_nearest_even:
5092        z.low += lastBitMask>>1;
5093        if ((z.low & roundBitsMask) == 0) {
5094            z.low &= ~lastBitMask;
5095        }
5096        break;
5097    case float_round_ties_away:
5098        z.low += lastBitMask >> 1;
5099        break;
5100    case float_round_to_zero:
5101        break;
5102    case float_round_up:
5103        if (!extractFloatx80Sign(z)) {
5104            z.low += roundBitsMask;
5105        }
5106        break;
5107    case float_round_down:
5108        if (extractFloatx80Sign(z)) {
5109            z.low += roundBitsMask;
5110        }
5111        break;
5112    default:
5113        abort();
5114    }
5115    z.low &= ~ roundBitsMask;
5116    if ( z.low == 0 ) {
5117        ++z.high;
5118        z.low = LIT64( 0x8000000000000000 );
5119    }
5120    if (z.low != a.low) {
5121        status->float_exception_flags |= float_flag_inexact;
5122    }
5123    return z;
5124
5125}
5126
5127/*----------------------------------------------------------------------------
5128| Returns the result of adding the absolute values of the extended double-
5129| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5130| negated before being returned.  `zSign' is ignored if the result is a NaN.
5131| The addition is performed according to the IEC/IEEE Standard for Binary
5132| Floating-Point Arithmetic.
5133*----------------------------------------------------------------------------*/
5134
5135static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5136                                float_status *status)
5137{
5138    int32_t aExp, bExp, zExp;
5139    uint64_t aSig, bSig, zSig0, zSig1;
5140    int32_t expDiff;
5141
5142    aSig = extractFloatx80Frac( a );
5143    aExp = extractFloatx80Exp( a );
5144    bSig = extractFloatx80Frac( b );
5145    bExp = extractFloatx80Exp( b );
5146    expDiff = aExp - bExp;
5147    if ( 0 < expDiff ) {
5148        if ( aExp == 0x7FFF ) {
5149            if ((uint64_t)(aSig << 1)) {
5150                return propagateFloatx80NaN(a, b, status);
5151            }
5152            return a;
5153        }
5154        if ( bExp == 0 ) --expDiff;
5155        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5156        zExp = aExp;
5157    }
5158    else if ( expDiff < 0 ) {
5159        if ( bExp == 0x7FFF ) {
5160            if ((uint64_t)(bSig << 1)) {
5161                return propagateFloatx80NaN(a, b, status);
5162            }
5163            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5164        }
5165        if ( aExp == 0 ) ++expDiff;
5166        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5167        zExp = bExp;
5168    }
5169    else {
5170        if ( aExp == 0x7FFF ) {
5171            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5172                return propagateFloatx80NaN(a, b, status);
5173            }
5174            return a;
5175        }
5176        zSig1 = 0;
5177        zSig0 = aSig + bSig;
5178        if ( aExp == 0 ) {
5179            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5180            goto roundAndPack;
5181        }
5182        zExp = aExp;
5183        goto shiftRight1;
5184    }
5185    zSig0 = aSig + bSig;
5186    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5187 shiftRight1:
5188    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5189    zSig0 |= LIT64( 0x8000000000000000 );
5190    ++zExp;
5191 roundAndPack:
5192    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5193                                zSign, zExp, zSig0, zSig1, status);
5194}
5195
5196/*----------------------------------------------------------------------------
5197| Returns the result of subtracting the absolute values of the extended
5198| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5199| difference is negated before being returned.  `zSign' is ignored if the
5200| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5201| Standard for Binary Floating-Point Arithmetic.
5202*----------------------------------------------------------------------------*/
5203
5204static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5205                                float_status *status)
5206{
5207    int32_t aExp, bExp, zExp;
5208    uint64_t aSig, bSig, zSig0, zSig1;
5209    int32_t expDiff;
5210    floatx80 z;
5211
5212    aSig = extractFloatx80Frac( a );
5213    aExp = extractFloatx80Exp( a );
5214    bSig = extractFloatx80Frac( b );
5215    bExp = extractFloatx80Exp( b );
5216    expDiff = aExp - bExp;
5217    if ( 0 < expDiff ) goto aExpBigger;
5218    if ( expDiff < 0 ) goto bExpBigger;
5219    if ( aExp == 0x7FFF ) {
5220        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5221            return propagateFloatx80NaN(a, b, status);
5222        }
5223        float_raise(float_flag_invalid, status);
5224        z.low = floatx80_default_nan_low;
5225        z.high = floatx80_default_nan_high;
5226        return z;
5227    }
5228    if ( aExp == 0 ) {
5229        aExp = 1;
5230        bExp = 1;
5231    }
5232    zSig1 = 0;
5233    if ( bSig < aSig ) goto aBigger;
5234    if ( aSig < bSig ) goto bBigger;
5235    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5236 bExpBigger:
5237    if ( bExp == 0x7FFF ) {
5238        if ((uint64_t)(bSig << 1)) {
5239            return propagateFloatx80NaN(a, b, status);
5240        }
5241        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5242    }
5243    if ( aExp == 0 ) ++expDiff;
5244    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5245 bBigger:
5246    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5247    zExp = bExp;
5248    zSign ^= 1;
5249    goto normalizeRoundAndPack;
5250 aExpBigger:
5251    if ( aExp == 0x7FFF ) {
5252        if ((uint64_t)(aSig << 1)) {
5253            return propagateFloatx80NaN(a, b, status);
5254        }
5255        return a;
5256    }
5257    if ( bExp == 0 ) --expDiff;
5258    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5259 aBigger:
5260    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5261    zExp = aExp;
5262 normalizeRoundAndPack:
5263    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5264                                         zSign, zExp, zSig0, zSig1, status);
5265}
5266
5267/*----------------------------------------------------------------------------
5268| Returns the result of adding the extended double-precision floating-point
5269| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5270| Standard for Binary Floating-Point Arithmetic.
5271*----------------------------------------------------------------------------*/
5272
5273floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5274{
5275    flag aSign, bSign;
5276
5277    aSign = extractFloatx80Sign( a );
5278    bSign = extractFloatx80Sign( b );
5279    if ( aSign == bSign ) {
5280        return addFloatx80Sigs(a, b, aSign, status);
5281    }
5282    else {
5283        return subFloatx80Sigs(a, b, aSign, status);
5284    }
5285
5286}
5287
5288/*----------------------------------------------------------------------------
5289| Returns the result of subtracting the extended double-precision floating-
5290| point values `a' and `b'.  The operation is performed according to the
5291| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5292*----------------------------------------------------------------------------*/
5293
5294floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5295{
5296    flag aSign, bSign;
5297
5298    aSign = extractFloatx80Sign( a );
5299    bSign = extractFloatx80Sign( b );
5300    if ( aSign == bSign ) {
5301        return subFloatx80Sigs(a, b, aSign, status);
5302    }
5303    else {
5304        return addFloatx80Sigs(a, b, aSign, status);
5305    }
5306
5307}
5308
5309/*----------------------------------------------------------------------------
5310| Returns the result of multiplying the extended double-precision floating-
5311| point values `a' and `b'.  The operation is performed according to the
5312| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5313*----------------------------------------------------------------------------*/
5314
5315floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5316{
5317    flag aSign, bSign, zSign;
5318    int32_t aExp, bExp, zExp;
5319    uint64_t aSig, bSig, zSig0, zSig1;
5320    floatx80 z;
5321
5322    aSig = extractFloatx80Frac( a );
5323    aExp = extractFloatx80Exp( a );
5324    aSign = extractFloatx80Sign( a );
5325    bSig = extractFloatx80Frac( b );
5326    bExp = extractFloatx80Exp( b );
5327    bSign = extractFloatx80Sign( b );
5328    zSign = aSign ^ bSign;
5329    if ( aExp == 0x7FFF ) {
5330        if (    (uint64_t) ( aSig<<1 )
5331             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5332            return propagateFloatx80NaN(a, b, status);
5333        }
5334        if ( ( bExp | bSig ) == 0 ) goto invalid;
5335        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5336    }
5337    if ( bExp == 0x7FFF ) {
5338        if ((uint64_t)(bSig << 1)) {
5339            return propagateFloatx80NaN(a, b, status);
5340        }
5341        if ( ( aExp | aSig ) == 0 ) {
5342 invalid:
5343            float_raise(float_flag_invalid, status);
5344            z.low = floatx80_default_nan_low;
5345            z.high = floatx80_default_nan_high;
5346            return z;
5347        }
5348        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5349    }
5350    if ( aExp == 0 ) {
5351        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5352        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5353    }
5354    if ( bExp == 0 ) {
5355        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5356        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5357    }
5358    zExp = aExp + bExp - 0x3FFE;
5359    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5360    if ( 0 < (int64_t) zSig0 ) {
5361        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5362        --zExp;
5363    }
5364    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5365                                zSign, zExp, zSig0, zSig1, status);
5366}
5367
5368/*----------------------------------------------------------------------------
5369| Returns the result of dividing the extended double-precision floating-point
5370| value `a' by the corresponding value `b'.  The operation is performed
5371| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5372*----------------------------------------------------------------------------*/
5373
5374floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5375{
5376    flag aSign, bSign, zSign;
5377    int32_t aExp, bExp, zExp;
5378    uint64_t aSig, bSig, zSig0, zSig1;
5379    uint64_t rem0, rem1, rem2, term0, term1, term2;
5380    floatx80 z;
5381
5382    aSig = extractFloatx80Frac( a );
5383    aExp = extractFloatx80Exp( a );
5384    aSign = extractFloatx80Sign( a );
5385    bSig = extractFloatx80Frac( b );
5386    bExp = extractFloatx80Exp( b );
5387    bSign = extractFloatx80Sign( b );
5388    zSign = aSign ^ bSign;
5389    if ( aExp == 0x7FFF ) {
5390        if ((uint64_t)(aSig << 1)) {
5391            return propagateFloatx80NaN(a, b, status);
5392        }
5393        if ( bExp == 0x7FFF ) {
5394            if ((uint64_t)(bSig << 1)) {
5395                return propagateFloatx80NaN(a, b, status);
5396            }
5397            goto invalid;
5398        }
5399        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5400    }
5401    if ( bExp == 0x7FFF ) {
5402        if ((uint64_t)(bSig << 1)) {
5403            return propagateFloatx80NaN(a, b, status);
5404        }
5405        return packFloatx80( zSign, 0, 0 );
5406    }
5407    if ( bExp == 0 ) {
5408        if ( bSig == 0 ) {
5409            if ( ( aExp | aSig ) == 0 ) {
5410 invalid:
5411                float_raise(float_flag_invalid, status);
5412                z.low = floatx80_default_nan_low;
5413                z.high = floatx80_default_nan_high;
5414                return z;
5415            }
5416            float_raise(float_flag_divbyzero, status);
5417            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5418        }
5419        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5420    }
5421    if ( aExp == 0 ) {
5422        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5423        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5424    }
5425    zExp = aExp - bExp + 0x3FFE;
5426    rem1 = 0;
5427    if ( bSig <= aSig ) {
5428        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5429        ++zExp;
5430    }
5431    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5432    mul64To128( bSig, zSig0, &term0, &term1 );
5433    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5434    while ( (int64_t) rem0 < 0 ) {
5435        --zSig0;
5436        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5437    }
5438    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5439    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5440        mul64To128( bSig, zSig1, &term1, &term2 );
5441        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5442        while ( (int64_t) rem1 < 0 ) {
5443            --zSig1;
5444            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5445        }
5446        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5447    }
5448    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5449                                zSign, zExp, zSig0, zSig1, status);
5450}
5451
5452/*----------------------------------------------------------------------------
5453| Returns the remainder of the extended double-precision floating-point value
5454| `a' with respect to the corresponding value `b'.  The operation is performed
5455| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5456*----------------------------------------------------------------------------*/
5457
5458floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5459{
5460    flag aSign, zSign;
5461    int32_t aExp, bExp, expDiff;
5462    uint64_t aSig0, aSig1, bSig;
5463    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5464    floatx80 z;
5465
5466    aSig0 = extractFloatx80Frac( a );
5467    aExp = extractFloatx80Exp( a );
5468    aSign = extractFloatx80Sign( a );
5469    bSig = extractFloatx80Frac( b );
5470    bExp = extractFloatx80Exp( b );
5471    if ( aExp == 0x7FFF ) {
5472        if (    (uint64_t) ( aSig0<<1 )
5473             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5474            return propagateFloatx80NaN(a, b, status);
5475        }
5476        goto invalid;
5477    }
5478    if ( bExp == 0x7FFF ) {
5479        if ((uint64_t)(bSig << 1)) {
5480            return propagateFloatx80NaN(a, b, status);
5481        }
5482        return a;
5483    }
5484    if ( bExp == 0 ) {
5485        if ( bSig == 0 ) {
5486 invalid:
5487            float_raise(float_flag_invalid, status);
5488            z.low = floatx80_default_nan_low;
5489            z.high = floatx80_default_nan_high;
5490            return z;
5491        }
5492        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5493    }
5494    if ( aExp == 0 ) {
5495        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5496        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5497    }
5498    bSig |= LIT64( 0x8000000000000000 );
5499    zSign = aSign;
5500    expDiff = aExp - bExp;
5501    aSig1 = 0;
5502    if ( expDiff < 0 ) {
5503        if ( expDiff < -1 ) return a;
5504        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5505        expDiff = 0;
5506    }
5507    q = ( bSig <= aSig0 );
5508    if ( q ) aSig0 -= bSig;
5509    expDiff -= 64;
5510    while ( 0 < expDiff ) {
5511        q = estimateDiv128To64( aSig0, aSig1, bSig );
5512        q = ( 2 < q ) ? q - 2 : 0;
5513        mul64To128( bSig, q, &term0, &term1 );
5514        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5515        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5516        expDiff -= 62;
5517    }
5518    expDiff += 64;
5519    if ( 0 < expDiff ) {
5520        q = estimateDiv128To64( aSig0, aSig1, bSig );
5521        q = ( 2 < q ) ? q - 2 : 0;
5522        q >>= 64 - expDiff;
5523        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5524        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5525        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5526        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5527            ++q;
5528            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5529        }
5530    }
5531    else {
5532        term1 = 0;
5533        term0 = bSig;
5534    }
5535    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5536    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5537         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5538              && ( q & 1 ) )
5539       ) {
5540        aSig0 = alternateASig0;
5541        aSig1 = alternateASig1;
5542        zSign = ! zSign;
5543    }
5544    return
5545        normalizeRoundAndPackFloatx80(
5546            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5547
5548}
5549
5550/*----------------------------------------------------------------------------
5551| Returns the square root of the extended double-precision floating-point
5552| value `a'.  The operation is performed according to the IEC/IEEE Standard
5553| for Binary Floating-Point Arithmetic.
5554*----------------------------------------------------------------------------*/
5555
5556floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5557{
5558    flag aSign;
5559    int32_t aExp, zExp;
5560    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5561    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5562    floatx80 z;
5563
5564    aSig0 = extractFloatx80Frac( a );
5565    aExp = extractFloatx80Exp( a );
5566    aSign = extractFloatx80Sign( a );
5567    if ( aExp == 0x7FFF ) {
5568        if ((uint64_t)(aSig0 << 1)) {
5569            return propagateFloatx80NaN(a, a, status);
5570        }
5571        if ( ! aSign ) return a;
5572        goto invalid;
5573    }
5574    if ( aSign ) {
5575        if ( ( aExp | aSig0 ) == 0 ) return a;
5576 invalid:
5577        float_raise(float_flag_invalid, status);
5578        z.low = floatx80_default_nan_low;
5579        z.high = floatx80_default_nan_high;
5580        return z;
5581    }
5582    if ( aExp == 0 ) {
5583        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5584        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5585    }
5586    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5587    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5588    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5589    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5590    doubleZSig0 = zSig0<<1;
5591    mul64To128( zSig0, zSig0, &term0, &term1 );
5592    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5593    while ( (int64_t) rem0 < 0 ) {
5594        --zSig0;
5595        doubleZSig0 -= 2;
5596        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5597    }
5598    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5599    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5600        if ( zSig1 == 0 ) zSig1 = 1;
5601        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5602        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5603        mul64To128( zSig1, zSig1, &term2, &term3 );
5604        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5605        while ( (int64_t) rem1 < 0 ) {
5606            --zSig1;
5607            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5608            term3 |= 1;
5609            term2 |= doubleZSig0;
5610            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5611        }
5612        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5613    }
5614    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5615    zSig0 |= doubleZSig0;
5616    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5617                                0, zExp, zSig0, zSig1, status);
5618}
5619
5620/*----------------------------------------------------------------------------
5621| Returns 1 if the extended double-precision floating-point value `a' is equal
5622| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5623| raised if either operand is a NaN.  Otherwise, the comparison is performed
5624| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625*----------------------------------------------------------------------------*/
5626
5627int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5628{
5629
5630    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5631              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5632         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5633              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5634       ) {
5635        float_raise(float_flag_invalid, status);
5636        return 0;
5637    }
5638    return
5639           ( a.low == b.low )
5640        && (    ( a.high == b.high )
5641             || (    ( a.low == 0 )
5642                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5643           );
5644
5645}
5646
5647/*----------------------------------------------------------------------------
5648| Returns 1 if the extended double-precision floating-point value `a' is
5649| less than or equal to the corresponding value `b', and 0 otherwise.  The
5650| invalid exception is raised if either operand is a NaN.  The comparison is
5651| performed according to the IEC/IEEE Standard for Binary Floating-Point
5652| Arithmetic.
5653*----------------------------------------------------------------------------*/
5654
5655int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5656{
5657    flag aSign, bSign;
5658
5659    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5660              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5661         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5662              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5663       ) {
5664        float_raise(float_flag_invalid, status);
5665        return 0;
5666    }
5667    aSign = extractFloatx80Sign( a );
5668    bSign = extractFloatx80Sign( b );
5669    if ( aSign != bSign ) {
5670        return
5671               aSign
5672            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5673                 == 0 );
5674    }
5675    return
5676          aSign ? le128( b.high, b.low, a.high, a.low )
5677        : le128( a.high, a.low, b.high, b.low );
5678
5679}
5680
5681/*----------------------------------------------------------------------------
5682| Returns 1 if the extended double-precision floating-point value `a' is
5683| less than the corresponding value `b', and 0 otherwise.  The invalid
5684| exception is raised if either operand is a NaN.  The comparison is performed
5685| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5686*----------------------------------------------------------------------------*/
5687
5688int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5689{
5690    flag aSign, bSign;
5691
5692    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5693              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5694         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5695              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5696       ) {
5697        float_raise(float_flag_invalid, status);
5698        return 0;
5699    }
5700    aSign = extractFloatx80Sign( a );
5701    bSign = extractFloatx80Sign( b );
5702    if ( aSign != bSign ) {
5703        return
5704               aSign
5705            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5706                 != 0 );
5707    }
5708    return
5709          aSign ? lt128( b.high, b.low, a.high, a.low )
5710        : lt128( a.high, a.low, b.high, b.low );
5711
5712}
5713
5714/*----------------------------------------------------------------------------
5715| Returns 1 if the extended double-precision floating-point values `a' and `b'
5716| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5717| either operand is a NaN.   The comparison is performed according to the
5718| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5719*----------------------------------------------------------------------------*/
5720int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5721{
5722    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5723              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5724         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5725              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5726       ) {
5727        float_raise(float_flag_invalid, status);
5728        return 1;
5729    }
5730    return 0;
5731}
5732
5733/*----------------------------------------------------------------------------
5734| Returns 1 if the extended double-precision floating-point value `a' is
5735| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5736| cause an exception.  The comparison is performed according to the IEC/IEEE
5737| Standard for Binary Floating-Point Arithmetic.
5738*----------------------------------------------------------------------------*/
5739
5740int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5741{
5742
5743    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5744              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5745         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5746              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5747       ) {
5748        if (    floatx80_is_signaling_nan( a )
5749             || floatx80_is_signaling_nan( b ) ) {
5750            float_raise(float_flag_invalid, status);
5751        }
5752        return 0;
5753    }
5754    return
5755           ( a.low == b.low )
5756        && (    ( a.high == b.high )
5757             || (    ( a.low == 0 )
5758                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5759           );
5760
5761}
5762
5763/*----------------------------------------------------------------------------
5764| Returns 1 if the extended double-precision floating-point value `a' is less
5765| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5766| do not cause an exception.  Otherwise, the comparison is performed according
5767| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5768*----------------------------------------------------------------------------*/
5769
5770int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5771{
5772    flag aSign, bSign;
5773
5774    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5775              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5776         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5777              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5778       ) {
5779        if (    floatx80_is_signaling_nan( a )
5780             || floatx80_is_signaling_nan( b ) ) {
5781            float_raise(float_flag_invalid, status);
5782        }
5783        return 0;
5784    }
5785    aSign = extractFloatx80Sign( a );
5786    bSign = extractFloatx80Sign( b );
5787    if ( aSign != bSign ) {
5788        return
5789               aSign
5790            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5791                 == 0 );
5792    }
5793    return
5794          aSign ? le128( b.high, b.low, a.high, a.low )
5795        : le128( a.high, a.low, b.high, b.low );
5796
5797}
5798
5799/*----------------------------------------------------------------------------
5800| Returns 1 if the extended double-precision floating-point value `a' is less
5801| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5802| an exception.  Otherwise, the comparison is performed according to the
5803| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5804*----------------------------------------------------------------------------*/
5805
5806int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5807{
5808    flag aSign, bSign;
5809
5810    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5811              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5812         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5813              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5814       ) {
5815        if (    floatx80_is_signaling_nan( a )
5816             || floatx80_is_signaling_nan( b ) ) {
5817            float_raise(float_flag_invalid, status);
5818        }
5819        return 0;
5820    }
5821    aSign = extractFloatx80Sign( a );
5822    bSign = extractFloatx80Sign( b );
5823    if ( aSign != bSign ) {
5824        return
5825               aSign
5826            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5827                 != 0 );
5828    }
5829    return
5830          aSign ? lt128( b.high, b.low, a.high, a.low )
5831        : lt128( a.high, a.low, b.high, b.low );
5832
5833}
5834
5835/*----------------------------------------------------------------------------
5836| Returns 1 if the extended double-precision floating-point values `a' and `b'
5837| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5838| The comparison is performed according to the IEC/IEEE Standard for Binary
5839| Floating-Point Arithmetic.
5840*----------------------------------------------------------------------------*/
5841int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5842{
5843    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5844              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5845         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5846              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5847       ) {
5848        if (    floatx80_is_signaling_nan( a )
5849             || floatx80_is_signaling_nan( b ) ) {
5850            float_raise(float_flag_invalid, status);
5851        }
5852        return 1;
5853    }
5854    return 0;
5855}
5856
5857/*----------------------------------------------------------------------------
5858| Returns the result of converting the quadruple-precision floating-point
5859| value `a' to the 32-bit two's complement integer format.  The conversion
5860| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5861| Arithmetic---which means in particular that the conversion is rounded
5862| according to the current rounding mode.  If `a' is a NaN, the largest
5863| positive integer is returned.  Otherwise, if the conversion overflows, the
5864| largest integer with the same sign as `a' is returned.
5865*----------------------------------------------------------------------------*/
5866
5867int32_t float128_to_int32(float128 a, float_status *status)
5868{
5869    flag aSign;
5870    int32_t aExp, shiftCount;
5871    uint64_t aSig0, aSig1;
5872
5873    aSig1 = extractFloat128Frac1( a );
5874    aSig0 = extractFloat128Frac0( a );
5875    aExp = extractFloat128Exp( a );
5876    aSign = extractFloat128Sign( a );
5877    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5878    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5879    aSig0 |= ( aSig1 != 0 );
5880    shiftCount = 0x4028 - aExp;
5881    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5882    return roundAndPackInt32(aSign, aSig0, status);
5883
5884}
5885
5886/*----------------------------------------------------------------------------
5887| Returns the result of converting the quadruple-precision floating-point
5888| value `a' to the 32-bit two's complement integer format.  The conversion
5889| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5890| Arithmetic, except that the conversion is always rounded toward zero.  If
5891| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5892| conversion overflows, the largest integer with the same sign as `a' is
5893| returned.
5894*----------------------------------------------------------------------------*/
5895
5896int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5897{
5898    flag aSign;
5899    int32_t aExp, shiftCount;
5900    uint64_t aSig0, aSig1, savedASig;
5901    int32_t z;
5902
5903    aSig1 = extractFloat128Frac1( a );
5904    aSig0 = extractFloat128Frac0( a );
5905    aExp = extractFloat128Exp( a );
5906    aSign = extractFloat128Sign( a );
5907    aSig0 |= ( aSig1 != 0 );
5908    if ( 0x401E < aExp ) {
5909        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5910        goto invalid;
5911    }
5912    else if ( aExp < 0x3FFF ) {
5913        if (aExp || aSig0) {
5914            status->float_exception_flags |= float_flag_inexact;
5915        }
5916        return 0;
5917    }
5918    aSig0 |= LIT64( 0x0001000000000000 );
5919    shiftCount = 0x402F - aExp;
5920    savedASig = aSig0;
5921    aSig0 >>= shiftCount;
5922    z = aSig0;
5923    if ( aSign ) z = - z;
5924    if ( ( z < 0 ) ^ aSign ) {
5925 invalid:
5926        float_raise(float_flag_invalid, status);
5927        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5928    }
5929    if ( ( aSig0<<shiftCount ) != savedASig ) {
5930        status->float_exception_flags |= float_flag_inexact;
5931    }
5932    return z;
5933
5934}
5935
5936/*----------------------------------------------------------------------------
5937| Returns the result of converting the quadruple-precision floating-point
5938| value `a' to the 64-bit two's complement integer format.  The conversion
5939| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5940| Arithmetic---which means in particular that the conversion is rounded
5941| according to the current rounding mode.  If `a' is a NaN, the largest
5942| positive integer is returned.  Otherwise, if the conversion overflows, the
5943| largest integer with the same sign as `a' is returned.
5944*----------------------------------------------------------------------------*/
5945
5946int64_t float128_to_int64(float128 a, float_status *status)
5947{
5948    flag aSign;
5949    int32_t aExp, shiftCount;
5950    uint64_t aSig0, aSig1;
5951
5952    aSig1 = extractFloat128Frac1( a );
5953    aSig0 = extractFloat128Frac0( a );
5954    aExp = extractFloat128Exp( a );
5955    aSign = extractFloat128Sign( a );
5956    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5957    shiftCount = 0x402F - aExp;
5958    if ( shiftCount <= 0 ) {
5959        if ( 0x403E < aExp ) {
5960            float_raise(float_flag_invalid, status);
5961            if (    ! aSign
5962                 || (    ( aExp == 0x7FFF )
5963                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5964                    )
5965               ) {
5966                return LIT64( 0x7FFFFFFFFFFFFFFF );
5967            }
5968            return (int64_t) LIT64( 0x8000000000000000 );
5969        }
5970        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5971    }
5972    else {
5973        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5974    }
5975    return roundAndPackInt64(aSign, aSig0, aSig1, status);
5976
5977}
5978
5979/*----------------------------------------------------------------------------
5980| Returns the result of converting the quadruple-precision floating-point
5981| value `a' to the 64-bit two's complement integer format.  The conversion
5982| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5983| Arithmetic, except that the conversion is always rounded toward zero.
5984| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5985| the conversion overflows, the largest integer with the same sign as `a' is
5986| returned.
5987*----------------------------------------------------------------------------*/
5988
5989int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5990{
5991    flag aSign;
5992    int32_t aExp, shiftCount;
5993    uint64_t aSig0, aSig1;
5994    int64_t z;
5995
5996    aSig1 = extractFloat128Frac1( a );
5997    aSig0 = extractFloat128Frac0( a );
5998    aExp = extractFloat128Exp( a );
5999    aSign = extractFloat128Sign( a );
6000    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6001    shiftCount = aExp - 0x402F;
6002    if ( 0 < shiftCount ) {
6003        if ( 0x403E <= aExp ) {
6004            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6005            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6006                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6007                if (aSig1) {
6008                    status->float_exception_flags |= float_flag_inexact;
6009                }
6010            }
6011            else {
6012                float_raise(float_flag_invalid, status);
6013                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6014                    return LIT64( 0x7FFFFFFFFFFFFFFF );
6015                }
6016            }
6017            return (int64_t) LIT64( 0x8000000000000000 );
6018        }
6019        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6020        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6021            status->float_exception_flags |= float_flag_inexact;
6022        }
6023    }
6024    else {
6025        if ( aExp < 0x3FFF ) {
6026            if ( aExp | aSig0 | aSig1 ) {
6027                status->float_exception_flags |= float_flag_inexact;
6028            }
6029            return 0;
6030        }
6031        z = aSig0>>( - shiftCount );
6032        if (    aSig1
6033             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6034            status->float_exception_flags |= float_flag_inexact;
6035        }
6036    }
6037    if ( aSign ) z = - z;
6038    return z;
6039
6040}
6041
6042/*----------------------------------------------------------------------------
6043| Returns the result of converting the quadruple-precision floating-point
6044| value `a' to the single-precision floating-point format.  The conversion
6045| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6046| Arithmetic.
6047*----------------------------------------------------------------------------*/
6048
6049float32 float128_to_float32(float128 a, float_status *status)
6050{
6051    flag aSign;
6052    int32_t aExp;
6053    uint64_t aSig0, aSig1;
6054    uint32_t zSig;
6055
6056    aSig1 = extractFloat128Frac1( a );
6057    aSig0 = extractFloat128Frac0( a );
6058    aExp = extractFloat128Exp( a );
6059    aSign = extractFloat128Sign( a );
6060    if ( aExp == 0x7FFF ) {
6061        if ( aSig0 | aSig1 ) {
6062            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6063        }
6064        return packFloat32( aSign, 0xFF, 0 );
6065    }
6066    aSig0 |= ( aSig1 != 0 );
6067    shift64RightJamming( aSig0, 18, &aSig0 );
6068    zSig = aSig0;
6069    if ( aExp || zSig ) {
6070        zSig |= 0x40000000;
6071        aExp -= 0x3F81;
6072    }
6073    return roundAndPackFloat32(aSign, aExp, zSig, status);
6074
6075}
6076
6077/*----------------------------------------------------------------------------
6078| Returns the result of converting the quadruple-precision floating-point
6079| value `a' to the double-precision floating-point format.  The conversion
6080| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6081| Arithmetic.
6082*----------------------------------------------------------------------------*/
6083
6084float64 float128_to_float64(float128 a, float_status *status)
6085{
6086    flag aSign;
6087    int32_t aExp;
6088    uint64_t aSig0, aSig1;
6089
6090    aSig1 = extractFloat128Frac1( a );
6091    aSig0 = extractFloat128Frac0( a );
6092    aExp = extractFloat128Exp( a );
6093    aSign = extractFloat128Sign( a );
6094    if ( aExp == 0x7FFF ) {
6095        if ( aSig0 | aSig1 ) {
6096            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6097        }
6098        return packFloat64( aSign, 0x7FF, 0 );
6099    }
6100    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6101    aSig0 |= ( aSig1 != 0 );
6102    if ( aExp || aSig0 ) {
6103        aSig0 |= LIT64( 0x4000000000000000 );
6104        aExp -= 0x3C01;
6105    }
6106    return roundAndPackFloat64(aSign, aExp, aSig0, status);
6107
6108}
6109
6110/*----------------------------------------------------------------------------
6111| Returns the result of converting the quadruple-precision floating-point
6112| value `a' to the extended double-precision floating-point format.  The
6113| conversion is performed according to the IEC/IEEE Standard for Binary
6114| Floating-Point Arithmetic.
6115*----------------------------------------------------------------------------*/
6116
6117floatx80 float128_to_floatx80(float128 a, float_status *status)
6118{
6119    flag aSign;
6120    int32_t aExp;
6121    uint64_t aSig0, aSig1;
6122
6123    aSig1 = extractFloat128Frac1( a );
6124    aSig0 = extractFloat128Frac0( a );
6125    aExp = extractFloat128Exp( a );
6126    aSign = extractFloat128Sign( a );
6127    if ( aExp == 0x7FFF ) {
6128        if ( aSig0 | aSig1 ) {
6129            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6130        }
6131        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6132    }
6133    if ( aExp == 0 ) {
6134        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6135        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6136    }
6137    else {
6138        aSig0 |= LIT64( 0x0001000000000000 );
6139    }
6140    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6141    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6142
6143}
6144
6145/*----------------------------------------------------------------------------
6146| Rounds the quadruple-precision floating-point value `a' to an integer, and
6147| returns the result as a quadruple-precision floating-point value.  The
6148| operation is performed according to the IEC/IEEE Standard for Binary
6149| Floating-Point Arithmetic.
6150*----------------------------------------------------------------------------*/
6151
6152float128 float128_round_to_int(float128 a, float_status *status)
6153{
6154    flag aSign;
6155    int32_t aExp;
6156    uint64_t lastBitMask, roundBitsMask;
6157    float128 z;
6158
6159    aExp = extractFloat128Exp( a );
6160    if ( 0x402F <= aExp ) {
6161        if ( 0x406F <= aExp ) {
6162            if (    ( aExp == 0x7FFF )
6163                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6164               ) {
6165                return propagateFloat128NaN(a, a, status);
6166            }
6167            return a;
6168        }
6169        lastBitMask = 1;
6170        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6171        roundBitsMask = lastBitMask - 1;
6172        z = a;
6173        switch (status->float_rounding_mode) {
6174        case float_round_nearest_even:
6175            if ( lastBitMask ) {
6176                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6177                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6178            }
6179            else {
6180                if ( (int64_t) z.low < 0 ) {
6181                    ++z.high;
6182                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6183                }
6184            }
6185            break;
6186        case float_round_ties_away:
6187            if (lastBitMask) {
6188                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6189            } else {
6190                if ((int64_t) z.low < 0) {
6191                    ++z.high;
6192                }
6193            }
6194            break;
6195        case float_round_to_zero:
6196            break;
6197        case float_round_up:
6198            if (!extractFloat128Sign(z)) {
6199                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6200            }
6201            break;
6202        case float_round_down:
6203            if (extractFloat128Sign(z)) {
6204                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6205            }
6206            break;
6207        default:
6208            abort();
6209        }
6210        z.low &= ~ roundBitsMask;
6211    }
6212    else {
6213        if ( aExp < 0x3FFF ) {
6214            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6215            status->float_exception_flags |= float_flag_inexact;
6216            aSign = extractFloat128Sign( a );
6217            switch (status->float_rounding_mode) {
6218             case float_round_nearest_even:
6219                if (    ( aExp == 0x3FFE )
6220                     && (   extractFloat128Frac0( a )
6221                          | extractFloat128Frac1( a ) )
6222                   ) {
6223                    return packFloat128( aSign, 0x3FFF, 0, 0 );
6224                }
6225                break;
6226            case float_round_ties_away:
6227                if (aExp == 0x3FFE) {
6228                    return packFloat128(aSign, 0x3FFF, 0, 0);
6229                }
6230                break;
6231             case float_round_down:
6232                return
6233                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6234                    : packFloat128( 0, 0, 0, 0 );
6235             case float_round_up:
6236                return
6237                      aSign ? packFloat128( 1, 0, 0, 0 )
6238                    : packFloat128( 0, 0x3FFF, 0, 0 );
6239            }
6240            return packFloat128( aSign, 0, 0, 0 );
6241        }
6242        lastBitMask = 1;
6243        lastBitMask <<= 0x402F - aExp;
6244        roundBitsMask = lastBitMask - 1;
6245        z.low = 0;
6246        z.high = a.high;
6247        switch (status->float_rounding_mode) {
6248        case float_round_nearest_even:
6249            z.high += lastBitMask>>1;
6250            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6251                z.high &= ~ lastBitMask;
6252            }
6253            break;
6254        case float_round_ties_away:
6255            z.high += lastBitMask>>1;
6256            break;
6257        case float_round_to_zero:
6258            break;
6259        case float_round_up:
6260            if (!extractFloat128Sign(z)) {
6261                z.high |= ( a.low != 0 );
6262                z.high += roundBitsMask;
6263            }
6264            break;
6265        case float_round_down:
6266            if (extractFloat128Sign(z)) {
6267                z.high |= (a.low != 0);
6268                z.high += roundBitsMask;
6269            }
6270            break;
6271        default:
6272            abort();
6273        }
6274        z.high &= ~ roundBitsMask;
6275    }
6276    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6277        status->float_exception_flags |= float_flag_inexact;
6278    }
6279    return z;
6280
6281}
6282
6283/*----------------------------------------------------------------------------
6284| Returns the result of adding the absolute values of the quadruple-precision
6285| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6286| before being returned.  `zSign' is ignored if the result is a NaN.
6287| The addition is performed according to the IEC/IEEE Standard for Binary
6288| Floating-Point Arithmetic.
6289*----------------------------------------------------------------------------*/
6290
6291static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6292                                float_status *status)
6293{
6294    int32_t aExp, bExp, zExp;
6295    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6296    int32_t expDiff;
6297
6298    aSig1 = extractFloat128Frac1( a );
6299    aSig0 = extractFloat128Frac0( a );
6300    aExp = extractFloat128Exp( a );
6301    bSig1 = extractFloat128Frac1( b );
6302    bSig0 = extractFloat128Frac0( b );
6303    bExp = extractFloat128Exp( b );
6304    expDiff = aExp - bExp;
6305    if ( 0 < expDiff ) {
6306        if ( aExp == 0x7FFF ) {
6307            if (aSig0 | aSig1) {
6308                return propagateFloat128NaN(a, b, status);
6309            }
6310            return a;
6311        }
6312        if ( bExp == 0 ) {
6313            --expDiff;
6314        }
6315        else {
6316            bSig0 |= LIT64( 0x0001000000000000 );
6317        }
6318        shift128ExtraRightJamming(
6319            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6320        zExp = aExp;
6321    }
6322    else if ( expDiff < 0 ) {
6323        if ( bExp == 0x7FFF ) {
6324            if (bSig0 | bSig1) {
6325                return propagateFloat128NaN(a, b, status);
6326            }
6327            return packFloat128( zSign, 0x7FFF, 0, 0 );
6328        }
6329        if ( aExp == 0 ) {
6330            ++expDiff;
6331        }
6332        else {
6333            aSig0 |= LIT64( 0x0001000000000000 );
6334        }
6335        shift128ExtraRightJamming(
6336            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6337        zExp = bExp;
6338    }
6339    else {
6340        if ( aExp == 0x7FFF ) {
6341            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6342                return propagateFloat128NaN(a, b, status);
6343            }
6344            return a;
6345        }
6346        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6347        if ( aExp == 0 ) {
6348            if (status->flush_to_zero) {
6349                if (zSig0 | zSig1) {
6350                    float_raise(float_flag_output_denormal, status);
6351                }
6352                return packFloat128(zSign, 0, 0, 0);
6353            }
6354            return packFloat128( zSign, 0, zSig0, zSig1 );
6355        }
6356        zSig2 = 0;
6357        zSig0 |= LIT64( 0x0002000000000000 );
6358        zExp = aExp;
6359        goto shiftRight1;
6360    }
6361    aSig0 |= LIT64( 0x0001000000000000 );
6362    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6363    --zExp;
6364    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6365    ++zExp;
6366 shiftRight1:
6367    shift128ExtraRightJamming(
6368        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6369 roundAndPack:
6370    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6371
6372}
6373
6374/*----------------------------------------------------------------------------
6375| Returns the result of subtracting the absolute values of the quadruple-
6376| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6377| difference is negated before being returned.  `zSign' is ignored if the
6378| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6379| Standard for Binary Floating-Point Arithmetic.
6380*----------------------------------------------------------------------------*/
6381
6382static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6383                                float_status *status)
6384{
6385    int32_t aExp, bExp, zExp;
6386    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6387    int32_t expDiff;
6388    float128 z;
6389
6390    aSig1 = extractFloat128Frac1( a );
6391    aSig0 = extractFloat128Frac0( a );
6392    aExp = extractFloat128Exp( a );
6393    bSig1 = extractFloat128Frac1( b );
6394    bSig0 = extractFloat128Frac0( b );
6395    bExp = extractFloat128Exp( b );
6396    expDiff = aExp - bExp;
6397    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6398    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6399    if ( 0 < expDiff ) goto aExpBigger;
6400    if ( expDiff < 0 ) goto bExpBigger;
6401    if ( aExp == 0x7FFF ) {
6402        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6403            return propagateFloat128NaN(a, b, status);
6404        }
6405        float_raise(float_flag_invalid, status);
6406        z.low = float128_default_nan_low;
6407        z.high = float128_default_nan_high;
6408        return z;
6409    }
6410    if ( aExp == 0 ) {
6411        aExp = 1;
6412        bExp = 1;
6413    }
6414    if ( bSig0 < aSig0 ) goto aBigger;
6415    if ( aSig0 < bSig0 ) goto bBigger;
6416    if ( bSig1 < aSig1 ) goto aBigger;
6417    if ( aSig1 < bSig1 ) goto bBigger;
6418    return packFloat128(status->float_rounding_mode == float_round_down,
6419                        0, 0, 0);
6420 bExpBigger:
6421    if ( bExp == 0x7FFF ) {
6422        if (bSig0 | bSig1) {
6423            return propagateFloat128NaN(a, b, status);
6424        }
6425        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6426    }
6427    if ( aExp == 0 ) {
6428        ++expDiff;
6429    }
6430    else {
6431        aSig0 |= LIT64( 0x4000000000000000 );
6432    }
6433    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6434    bSig0 |= LIT64( 0x4000000000000000 );
6435 bBigger:
6436    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6437    zExp = bExp;
6438    zSign ^= 1;
6439    goto normalizeRoundAndPack;
6440 aExpBigger:
6441    if ( aExp == 0x7FFF ) {
6442        if (aSig0 | aSig1) {
6443            return propagateFloat128NaN(a, b, status);
6444        }
6445        return a;
6446    }
6447    if ( bExp == 0 ) {
6448        --expDiff;
6449    }
6450    else {
6451        bSig0 |= LIT64( 0x4000000000000000 );
6452    }
6453    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6454    aSig0 |= LIT64( 0x4000000000000000 );
6455 aBigger:
6456    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6457    zExp = aExp;
6458 normalizeRoundAndPack:
6459    --zExp;
6460    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6461                                         status);
6462
6463}
6464
6465/*----------------------------------------------------------------------------
6466| Returns the result of adding the quadruple-precision floating-point values
6467| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6468| for Binary Floating-Point Arithmetic.
6469*----------------------------------------------------------------------------*/
6470
6471float128 float128_add(float128 a, float128 b, float_status *status)
6472{
6473    flag aSign, bSign;
6474
6475    aSign = extractFloat128Sign( a );
6476    bSign = extractFloat128Sign( b );
6477    if ( aSign == bSign ) {
6478        return addFloat128Sigs(a, b, aSign, status);
6479    }
6480    else {
6481        return subFloat128Sigs(a, b, aSign, status);
6482    }
6483
6484}
6485
6486/*----------------------------------------------------------------------------
6487| Returns the result of subtracting the quadruple-precision floating-point
6488| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6489| Standard for Binary Floating-Point Arithmetic.
6490*----------------------------------------------------------------------------*/
6491
6492float128 float128_sub(float128 a, float128 b, float_status *status)
6493{
6494    flag aSign, bSign;
6495
6496    aSign = extractFloat128Sign( a );
6497    bSign = extractFloat128Sign( b );
6498    if ( aSign == bSign ) {
6499        return subFloat128Sigs(a, b, aSign, status);
6500    }
6501    else {
6502        return addFloat128Sigs(a, b, aSign, status);
6503    }
6504
6505}
6506
6507/*----------------------------------------------------------------------------
6508| Returns the result of multiplying the quadruple-precision floating-point
6509| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6510| Standard for Binary Floating-Point Arithmetic.
6511*----------------------------------------------------------------------------*/
6512
6513float128 float128_mul(float128 a, float128 b, float_status *status)
6514{
6515    flag aSign, bSign, zSign;
6516    int32_t aExp, bExp, zExp;
6517    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6518    float128 z;
6519
6520    aSig1 = extractFloat128Frac1( a );
6521    aSig0 = extractFloat128Frac0( a );
6522    aExp = extractFloat128Exp( a );
6523    aSign = extractFloat128Sign( a );
6524    bSig1 = extractFloat128Frac1( b );
6525    bSig0 = extractFloat128Frac0( b );
6526    bExp = extractFloat128Exp( b );
6527    bSign = extractFloat128Sign( b );
6528    zSign = aSign ^ bSign;
6529    if ( aExp == 0x7FFF ) {
6530        if (    ( aSig0 | aSig1 )
6531             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6532            return propagateFloat128NaN(a, b, status);
6533        }
6534        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6535        return packFloat128( zSign, 0x7FFF, 0, 0 );
6536    }
6537    if ( bExp == 0x7FFF ) {
6538        if (bSig0 | bSig1) {
6539            return propagateFloat128NaN(a, b, status);
6540        }
6541        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6542 invalid:
6543            float_raise(float_flag_invalid, status);
6544            z.low = float128_default_nan_low;
6545            z.high = float128_default_nan_high;
6546            return z;
6547        }
6548        return packFloat128( zSign, 0x7FFF, 0, 0 );
6549    }
6550    if ( aExp == 0 ) {
6551        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6552        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6553    }
6554    if ( bExp == 0 ) {
6555        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6556        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6557    }
6558    zExp = aExp + bExp - 0x4000;
6559    aSig0 |= LIT64( 0x0001000000000000 );
6560    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6561    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6562    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6563    zSig2 |= ( zSig3 != 0 );
6564    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6565        shift128ExtraRightJamming(
6566            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6567        ++zExp;
6568    }
6569    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6570
6571}
6572
6573/*----------------------------------------------------------------------------
6574| Returns the result of dividing the quadruple-precision floating-point value
6575| `a' by the corresponding value `b'.  The operation is performed according to
6576| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6577*----------------------------------------------------------------------------*/
6578
6579float128 float128_div(float128 a, float128 b, float_status *status)
6580{
6581    flag aSign, bSign, zSign;
6582    int32_t aExp, bExp, zExp;
6583    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6584    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6585    float128 z;
6586
6587    aSig1 = extractFloat128Frac1( a );
6588    aSig0 = extractFloat128Frac0( a );
6589    aExp = extractFloat128Exp( a );
6590    aSign = extractFloat128Sign( a );
6591    bSig1 = extractFloat128Frac1( b );
6592    bSig0 = extractFloat128Frac0( b );
6593    bExp = extractFloat128Exp( b );
6594    bSign = extractFloat128Sign( b );
6595    zSign = aSign ^ bSign;
6596    if ( aExp == 0x7FFF ) {
6597        if (aSig0 | aSig1) {
6598            return propagateFloat128NaN(a, b, status);
6599        }
6600        if ( bExp == 0x7FFF ) {
6601            if (bSig0 | bSig1) {
6602                return propagateFloat128NaN(a, b, status);
6603            }
6604            goto invalid;
6605        }
6606        return packFloat128( zSign, 0x7FFF, 0, 0 );
6607    }
6608    if ( bExp == 0x7FFF ) {
6609        if (bSig0 | bSig1) {
6610            return propagateFloat128NaN(a, b, status);
6611        }
6612        return packFloat128( zSign, 0, 0, 0 );
6613    }
6614    if ( bExp == 0 ) {
6615        if ( ( bSig0 | bSig1 ) == 0 ) {
6616            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6617 invalid:
6618                float_raise(float_flag_invalid, status);
6619                z.low = float128_default_nan_low;
6620                z.high = float128_default_nan_high;
6621                return z;
6622            }
6623            float_raise(float_flag_divbyzero, status);
6624            return packFloat128( zSign, 0x7FFF, 0, 0 );
6625        }
6626        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6627    }
6628    if ( aExp == 0 ) {
6629        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6630        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6631    }
6632    zExp = aExp - bExp + 0x3FFD;
6633    shortShift128Left(
6634        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6635    shortShift128Left(
6636        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6637    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6638        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6639        ++zExp;
6640    }
6641    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6642    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6643    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6644    while ( (int64_t) rem0 < 0 ) {
6645        --zSig0;
6646        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6647    }
6648    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6649    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6650        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6651        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6652        while ( (int64_t) rem1 < 0 ) {
6653            --zSig1;
6654            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6655        }
6656        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6657    }
6658    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6659    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6660
6661}
6662
6663/*----------------------------------------------------------------------------
6664| Returns the remainder of the quadruple-precision floating-point value `a'
6665| with respect to the corresponding value `b'.  The operation is performed
6666| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6667*----------------------------------------------------------------------------*/
6668
6669float128 float128_rem(float128 a, float128 b, float_status *status)
6670{
6671    flag aSign, zSign;
6672    int32_t aExp, bExp, expDiff;
6673    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6674    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6675    int64_t sigMean0;
6676    float128 z;
6677
6678    aSig1 = extractFloat128Frac1( a );
6679    aSig0 = extractFloat128Frac0( a );
6680    aExp = extractFloat128Exp( a );
6681    aSign = extractFloat128Sign( a );
6682    bSig1 = extractFloat128Frac1( b );
6683    bSig0 = extractFloat128Frac0( b );
6684    bExp = extractFloat128Exp( b );
6685    if ( aExp == 0x7FFF ) {
6686        if (    ( aSig0 | aSig1 )
6687             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6688            return propagateFloat128NaN(a, b, status);
6689        }
6690        goto invalid;
6691    }
6692    if ( bExp == 0x7FFF ) {
6693        if (bSig0 | bSig1) {
6694            return propagateFloat128NaN(a, b, status);
6695        }
6696        return a;
6697    }
6698    if ( bExp == 0 ) {
6699        if ( ( bSig0 | bSig1 ) == 0 ) {
6700 invalid:
6701            float_raise(float_flag_invalid, status);
6702            z.low = float128_default_nan_low;
6703            z.high = float128_default_nan_high;
6704            return z;
6705        }
6706        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6707    }
6708    if ( aExp == 0 ) {
6709        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6710        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6711    }
6712    expDiff = aExp - bExp;
6713    if ( expDiff < -1 ) return a;
6714    shortShift128Left(
6715        aSig0 | LIT64( 0x0001000000000000 ),
6716        aSig1,
6717        15 - ( expDiff < 0 ),
6718        &aSig0,
6719        &aSig1
6720    );
6721    shortShift128Left(
6722        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6723    q = le128( bSig0, bSig1, aSig0, aSig1 );
6724    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6725    expDiff -= 64;
6726    while ( 0 < expDiff ) {
6727        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6728        q = ( 4 < q ) ? q - 4 : 0;
6729        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6730        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6731        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6732        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6733        expDiff -= 61;
6734    }
6735    if ( -64 < expDiff ) {
6736        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6737        q = ( 4 < q ) ? q - 4 : 0;
6738        q >>= - expDiff;
6739        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6740        expDiff += 52;
6741        if ( expDiff < 0 ) {
6742            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6743        }
6744        else {
6745            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6746        }
6747        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6748        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6749    }
6750    else {
6751        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6752        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6753    }
6754    do {
6755        alternateASig0 = aSig0;
6756        alternateASig1 = aSig1;
6757        ++q;
6758        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6759    } while ( 0 <= (int64_t) aSig0 );
6760    add128(
6761        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6762    if (    ( sigMean0 < 0 )
6763         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6764        aSig0 = alternateASig0;
6765        aSig1 = alternateASig1;
6766    }
6767    zSign = ( (int64_t) aSig0 < 0 );
6768    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6769    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6770                                         status);
6771}
6772
6773/*----------------------------------------------------------------------------
6774| Returns the square root of the quadruple-precision floating-point value `a'.
6775| The operation is performed according to the IEC/IEEE Standard for Binary
6776| Floating-Point Arithmetic.
6777*----------------------------------------------------------------------------*/
6778
6779float128 float128_sqrt(float128 a, float_status *status)
6780{
6781    flag aSign;
6782    int32_t aExp, zExp;
6783    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6784    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6785    float128 z;
6786
6787    aSig1 = extractFloat128Frac1( a );
6788    aSig0 = extractFloat128Frac0( a );
6789    aExp = extractFloat128Exp( a );
6790    aSign = extractFloat128Sign( a );
6791    if ( aExp == 0x7FFF ) {
6792        if (aSig0 | aSig1) {
6793            return propagateFloat128NaN(a, a, status);
6794        }
6795        if ( ! aSign ) return a;
6796        goto invalid;
6797    }
6798    if ( aSign ) {
6799        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6800 invalid:
6801        float_raise(float_flag_invalid, status);
6802        z.low = float128_default_nan_low;
6803        z.high = float128_default_nan_high;
6804        return z;
6805    }
6806    if ( aExp == 0 ) {
6807        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6808        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6809    }
6810    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6811    aSig0 |= LIT64( 0x0001000000000000 );
6812    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6813    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6814    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6815    doubleZSig0 = zSig0<<1;
6816    mul64To128( zSig0, zSig0, &term0, &term1 );
6817    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6818    while ( (int64_t) rem0 < 0 ) {
6819        --zSig0;
6820        doubleZSig0 -= 2;
6821        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6822    }
6823    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6824    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6825        if ( zSig1 == 0 ) zSig1 = 1;
6826        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6827        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6828        mul64To128( zSig1, zSig1, &term2, &term3 );
6829        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6830        while ( (int64_t) rem1 < 0 ) {
6831            --zSig1;
6832            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6833            term3 |= 1;
6834            term2 |= doubleZSig0;
6835            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6836        }
6837        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6838    }
6839    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6840    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6841
6842}
6843
6844/*----------------------------------------------------------------------------
6845| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6846| the corresponding value `b', and 0 otherwise.  The invalid exception is
6847| raised if either operand is a NaN.  Otherwise, the comparison is performed
6848| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849*----------------------------------------------------------------------------*/
6850
6851int float128_eq(float128 a, float128 b, float_status *status)
6852{
6853
6854    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6855              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6856         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6857              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6858       ) {
6859        float_raise(float_flag_invalid, status);
6860        return 0;
6861    }
6862    return
6863           ( a.low == b.low )
6864        && (    ( a.high == b.high )
6865             || (    ( a.low == 0 )
6866                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6867           );
6868
6869}
6870
6871/*----------------------------------------------------------------------------
6872| Returns 1 if the quadruple-precision floating-point value `a' is less than
6873| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6874| exception is raised if either operand is a NaN.  The comparison is performed
6875| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6876*----------------------------------------------------------------------------*/
6877
6878int float128_le(float128 a, float128 b, float_status *status)
6879{
6880    flag aSign, bSign;
6881
6882    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6883              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6884         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6885              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6886       ) {
6887        float_raise(float_flag_invalid, status);
6888        return 0;
6889    }
6890    aSign = extractFloat128Sign( a );
6891    bSign = extractFloat128Sign( b );
6892    if ( aSign != bSign ) {
6893        return
6894               aSign
6895            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6896                 == 0 );
6897    }
6898    return
6899          aSign ? le128( b.high, b.low, a.high, a.low )
6900        : le128( a.high, a.low, b.high, b.low );
6901
6902}
6903
6904/*----------------------------------------------------------------------------
6905| Returns 1 if the quadruple-precision floating-point value `a' is less than
6906| the corresponding value `b', and 0 otherwise.  The invalid exception is
6907| raised if either operand is a NaN.  The comparison is performed according
6908| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6909*----------------------------------------------------------------------------*/
6910
6911int float128_lt(float128 a, float128 b, float_status *status)
6912{
6913    flag aSign, bSign;
6914
6915    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6916              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6917         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6918              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6919       ) {
6920        float_raise(float_flag_invalid, status);
6921        return 0;
6922    }
6923    aSign = extractFloat128Sign( a );
6924    bSign = extractFloat128Sign( b );
6925    if ( aSign != bSign ) {
6926        return
6927               aSign
6928            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6929                 != 0 );
6930    }
6931    return
6932          aSign ? lt128( b.high, b.low, a.high, a.low )
6933        : lt128( a.high, a.low, b.high, b.low );
6934
6935}
6936
6937/*----------------------------------------------------------------------------
6938| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6939| be compared, and 0 otherwise.  The invalid exception is raised if either
6940| operand is a NaN. The comparison is performed according to the IEC/IEEE
6941| Standard for Binary Floating-Point Arithmetic.
6942*----------------------------------------------------------------------------*/
6943
6944int float128_unordered(float128 a, float128 b, float_status *status)
6945{
6946    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6947              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6948         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6949              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6950       ) {
6951        float_raise(float_flag_invalid, status);
6952        return 1;
6953    }
6954    return 0;
6955}
6956
6957/*----------------------------------------------------------------------------
6958| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6959| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6960| exception.  The comparison is performed according to the IEC/IEEE Standard
6961| for Binary Floating-Point Arithmetic.
6962*----------------------------------------------------------------------------*/
6963
6964int float128_eq_quiet(float128 a, float128 b, float_status *status)
6965{
6966
6967    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6968              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6969         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6970              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6971       ) {
6972        if (    float128_is_signaling_nan( a )
6973             || float128_is_signaling_nan( b ) ) {
6974            float_raise(float_flag_invalid, status);
6975        }
6976        return 0;
6977    }
6978    return
6979           ( a.low == b.low )
6980        && (    ( a.high == b.high )
6981             || (    ( a.low == 0 )
6982                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6983           );
6984
6985}
6986
6987/*----------------------------------------------------------------------------
6988| Returns 1 if the quadruple-precision floating-point value `a' is less than
6989| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6990| cause an exception.  Otherwise, the comparison is performed according to the
6991| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6992*----------------------------------------------------------------------------*/
6993
6994int float128_le_quiet(float128 a, float128 b, float_status *status)
6995{
6996    flag aSign, bSign;
6997
6998    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6999              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7000         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7001              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7002       ) {
7003        if (    float128_is_signaling_nan( a )
7004             || float128_is_signaling_nan( b ) ) {
7005            float_raise(float_flag_invalid, status);
7006        }
7007        return 0;
7008    }
7009    aSign = extractFloat128Sign( a );
7010    bSign = extractFloat128Sign( b );
7011    if ( aSign != bSign ) {
7012        return
7013               aSign
7014            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7015                 == 0 );
7016    }
7017    return
7018          aSign ? le128( b.high, b.low, a.high, a.low )
7019        : le128( a.high, a.low, b.high, b.low );
7020
7021}
7022
7023/*----------------------------------------------------------------------------
7024| Returns 1 if the quadruple-precision floating-point value `a' is less than
7025| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7026| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7027| Standard for Binary Floating-Point Arithmetic.
7028*----------------------------------------------------------------------------*/
7029
7030int float128_lt_quiet(float128 a, float128 b, float_status *status)
7031{
7032    flag aSign, bSign;
7033
7034    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7035              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7036         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7037              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7038       ) {
7039        if (    float128_is_signaling_nan( a )
7040             || float128_is_signaling_nan( b ) ) {
7041            float_raise(float_flag_invalid, status);
7042        }
7043        return 0;
7044    }
7045    aSign = extractFloat128Sign( a );
7046    bSign = extractFloat128Sign( b );
7047    if ( aSign != bSign ) {
7048        return
7049               aSign
7050            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7051                 != 0 );
7052    }
7053    return
7054          aSign ? lt128( b.high, b.low, a.high, a.low )
7055        : lt128( a.high, a.low, b.high, b.low );
7056
7057}
7058
7059/*----------------------------------------------------------------------------
7060| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7061| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7062| comparison is performed according to the IEC/IEEE Standard for Binary
7063| Floating-Point Arithmetic.
7064*----------------------------------------------------------------------------*/
7065
7066int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7067{
7068    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7069              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7070         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7071              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7072       ) {
7073        if (    float128_is_signaling_nan( a )
7074             || float128_is_signaling_nan( b ) ) {
7075            float_raise(float_flag_invalid, status);
7076        }
7077        return 1;
7078    }
7079    return 0;
7080}
7081
7082/* misc functions */
7083float32 uint32_to_float32(uint32_t a, float_status *status)
7084{
7085    return int64_to_float32(a, status);
7086}
7087
7088float64 uint32_to_float64(uint32_t a, float_status *status)
7089{
7090    return int64_to_float64(a, status);
7091}
7092
7093uint32_t float32_to_uint32(float32 a, float_status *status)
7094{
7095    int64_t v;
7096    uint32_t res;
7097    int old_exc_flags = get_float_exception_flags(status);
7098
7099    v = float32_to_int64(a, status);
7100    if (v < 0) {
7101        res = 0;
7102    } else if (v > 0xffffffff) {
7103        res = 0xffffffff;
7104    } else {
7105        return v;
7106    }
7107    set_float_exception_flags(old_exc_flags, status);
7108    float_raise(float_flag_invalid, status);
7109    return res;
7110}
7111
7112uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7113{
7114    int64_t v;
7115    uint32_t res;
7116    int old_exc_flags = get_float_exception_flags(status);
7117
7118    v = float32_to_int64_round_to_zero(a, status);
7119    if (v < 0) {
7120        res = 0;
7121    } else if (v > 0xffffffff) {
7122        res = 0xffffffff;
7123    } else {
7124        return v;
7125    }
7126    set_float_exception_flags(old_exc_flags, status);
7127    float_raise(float_flag_invalid, status);
7128    return res;
7129}
7130
7131int16_t float32_to_int16(float32 a, float_status *status)
7132{
7133    int32_t v;
7134    int16_t res;
7135    int old_exc_flags = get_float_exception_flags(status);
7136
7137    v = float32_to_int32(a, status);
7138    if (v < -0x8000) {
7139        res = -0x8000;
7140    } else if (v > 0x7fff) {
7141        res = 0x7fff;
7142    } else {
7143        return v;
7144    }
7145
7146    set_float_exception_flags(old_exc_flags, status);
7147    float_raise(float_flag_invalid, status);
7148    return res;
7149}
7150
7151uint16_t float32_to_uint16(float32 a, float_status *status)
7152{
7153    int32_t v;
7154    uint16_t res;
7155    int old_exc_flags = get_float_exception_flags(status);
7156
7157    v = float32_to_int32(a, status);
7158    if (v < 0) {
7159        res = 0;
7160    } else if (v > 0xffff) {
7161        res = 0xffff;
7162    } else {
7163        return v;
7164    }
7165
7166    set_float_exception_flags(old_exc_flags, status);
7167    float_raise(float_flag_invalid, status);
7168    return res;
7169}
7170
7171uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7172{
7173    int64_t v;
7174    uint16_t res;
7175    int old_exc_flags = get_float_exception_flags(status);
7176
7177    v = float32_to_int64_round_to_zero(a, status);
7178    if (v < 0) {
7179        res = 0;
7180    } else if (v > 0xffff) {
7181        res = 0xffff;
7182    } else {
7183        return v;
7184    }
7185    set_float_exception_flags(old_exc_flags, status);
7186    float_raise(float_flag_invalid, status);
7187    return res;
7188}
7189
7190uint32_t float64_to_uint32(float64 a, float_status *status)
7191{
7192    uint64_t v;
7193    uint32_t res;
7194    int old_exc_flags = get_float_exception_flags(status);
7195
7196    v = float64_to_uint64(a, status);
7197    if (v > 0xffffffff) {
7198        res = 0xffffffff;
7199    } else {
7200        return v;
7201    }
7202    set_float_exception_flags(old_exc_flags, status);
7203    float_raise(float_flag_invalid, status);
7204    return res;
7205}
7206
7207uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7208{
7209    uint64_t v;
7210    uint32_t res;
7211    int old_exc_flags = get_float_exception_flags(status);
7212
7213    v = float64_to_uint64_round_to_zero(a, status);
7214    if (v > 0xffffffff) {
7215        res = 0xffffffff;
7216    } else {
7217        return v;
7218    }
7219    set_float_exception_flags(old_exc_flags, status);
7220    float_raise(float_flag_invalid, status);
7221    return res;
7222}
7223
7224int16_t float64_to_int16(float64 a, float_status *status)
7225{
7226    int64_t v;
7227    int16_t res;
7228    int old_exc_flags = get_float_exception_flags(status);
7229
7230    v = float64_to_int32(a, status);
7231    if (v < -0x8000) {
7232        res = -0x8000;
7233    } else if (v > 0x7fff) {
7234        res = 0x7fff;
7235    } else {
7236        return v;
7237    }
7238
7239    set_float_exception_flags(old_exc_flags, status);
7240    float_raise(float_flag_invalid, status);
7241    return res;
7242}
7243
7244uint16_t float64_to_uint16(float64 a, float_status *status)
7245{
7246    int64_t v;
7247    uint16_t res;
7248    int old_exc_flags = get_float_exception_flags(status);
7249
7250    v = float64_to_int32(a, status);
7251    if (v < 0) {
7252        res = 0;
7253    } else if (v > 0xffff) {
7254        res = 0xffff;
7255    } else {
7256        return v;
7257    }
7258
7259    set_float_exception_flags(old_exc_flags, status);
7260    float_raise(float_flag_invalid, status);
7261    return res;
7262}
7263
7264uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7265{
7266    int64_t v;
7267    uint16_t res;
7268    int old_exc_flags = get_float_exception_flags(status);
7269
7270    v = float64_to_int64_round_to_zero(a, status);
7271    if (v < 0) {
7272        res = 0;
7273    } else if (v > 0xffff) {
7274        res = 0xffff;
7275    } else {
7276        return v;
7277    }
7278    set_float_exception_flags(old_exc_flags, status);
7279    float_raise(float_flag_invalid, status);
7280    return res;
7281}
7282
7283/*----------------------------------------------------------------------------
7284| Returns the result of converting the double-precision floating-point value
7285| `a' to the 64-bit unsigned integer format.  The conversion is
7286| performed according to the IEC/IEEE Standard for Binary Floating-Point
7287| Arithmetic---which means in particular that the conversion is rounded
7288| according to the current rounding mode.  If `a' is a NaN, the largest
7289| positive integer is returned.  If the conversion overflows, the
7290| largest unsigned integer is returned.  If 'a' is negative, the value is
7291| rounded and zero is returned; negative values that do not round to zero
7292| will raise the inexact exception.
7293*----------------------------------------------------------------------------*/
7294
7295uint64_t float64_to_uint64(float64 a, float_status *status)
7296{
7297    flag aSign;
7298    int aExp;
7299    int shiftCount;
7300    uint64_t aSig, aSigExtra;
7301    a = float64_squash_input_denormal(a, status);
7302
7303    aSig = extractFloat64Frac(a);
7304    aExp = extractFloat64Exp(a);
7305    aSign = extractFloat64Sign(a);
7306    if (aSign && (aExp > 1022)) {
7307        float_raise(float_flag_invalid, status);
7308        if (float64_is_any_nan(a)) {
7309            return LIT64(0xFFFFFFFFFFFFFFFF);
7310        } else {
7311            return 0;
7312        }
7313    }
7314    if (aExp) {
7315        aSig |= LIT64(0x0010000000000000);
7316    }
7317    shiftCount = 0x433 - aExp;
7318    if (shiftCount <= 0) {
7319        if (0x43E < aExp) {
7320            float_raise(float_flag_invalid, status);
7321            return LIT64(0xFFFFFFFFFFFFFFFF);
7322        }
7323        aSigExtra = 0;
7324        aSig <<= -shiftCount;
7325    } else {
7326        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7327    }
7328    return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7329}
7330
7331uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7332{
7333    signed char current_rounding_mode = status->float_rounding_mode;
7334    set_float_rounding_mode(float_round_to_zero, status);
7335    int64_t v = float64_to_uint64(a, status);
7336    set_float_rounding_mode(current_rounding_mode, status);
7337    return v;
7338}
7339
7340#define COMPARE(s, nan_exp)                                                  \
7341static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7342                                      int is_quiet, float_status *status)    \
7343{                                                                            \
7344    flag aSign, bSign;                                                       \
7345    uint ## s ## _t av, bv;                                                  \
7346    a = float ## s ## _squash_input_denormal(a, status);                     \
7347    b = float ## s ## _squash_input_denormal(b, status);                     \
7348                                                                             \
7349    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7350         extractFloat ## s ## Frac( a ) ) ||                                 \
7351        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7352          extractFloat ## s ## Frac( b ) )) {                                \
7353        if (!is_quiet ||                                                     \
7354            float ## s ## _is_signaling_nan( a ) ||                          \
7355            float ## s ## _is_signaling_nan( b ) ) {                         \
7356            float_raise(float_flag_invalid, status);                         \
7357        }                                                                    \
7358        return float_relation_unordered;                                     \
7359    }                                                                        \
7360    aSign = extractFloat ## s ## Sign( a );                                  \
7361    bSign = extractFloat ## s ## Sign( b );                                  \
7362    av = float ## s ## _val(a);                                              \
7363    bv = float ## s ## _val(b);                                              \
7364    if ( aSign != bSign ) {                                                  \
7365        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7366            /* zero case */                                                  \
7367            return float_relation_equal;                                     \
7368        } else {                                                             \
7369            return 1 - (2 * aSign);                                          \
7370        }                                                                    \
7371    } else {                                                                 \
7372        if (av == bv) {                                                      \
7373            return float_relation_equal;                                     \
7374        } else {                                                             \
7375            return 1 - 2 * (aSign ^ ( av < bv ));                            \
7376        }                                                                    \
7377    }                                                                        \
7378}                                                                            \
7379                                                                             \
7380int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7381{                                                                            \
7382    return float ## s ## _compare_internal(a, b, 0, status);                 \
7383}                                                                            \
7384                                                                             \
7385int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7386                                 float_status *status)                       \
7387{                                                                            \
7388    return float ## s ## _compare_internal(a, b, 1, status);                 \
7389}
7390
7391COMPARE(32, 0xff)
7392COMPARE(64, 0x7ff)
7393
7394static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7395                                            int is_quiet, float_status *status)
7396{
7397    flag aSign, bSign;
7398
7399    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7400          ( extractFloatx80Frac( a )<<1 ) ) ||
7401        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7402          ( extractFloatx80Frac( b )<<1 ) )) {
7403        if (!is_quiet ||
7404            floatx80_is_signaling_nan( a ) ||
7405            floatx80_is_signaling_nan( b ) ) {
7406            float_raise(float_flag_invalid, status);
7407        }
7408        return float_relation_unordered;
7409    }
7410    aSign = extractFloatx80Sign( a );
7411    bSign = extractFloatx80Sign( b );
7412    if ( aSign != bSign ) {
7413
7414        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7415             ( ( a.low | b.low ) == 0 ) ) {
7416            /* zero case */
7417            return float_relation_equal;
7418        } else {
7419            return 1 - (2 * aSign);
7420        }
7421    } else {
7422        if (a.low == b.low && a.high == b.high) {
7423            return float_relation_equal;
7424        } else {
7425            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7426        }
7427    }
7428}
7429
7430int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7431{
7432    return floatx80_compare_internal(a, b, 0, status);
7433}
7434
7435int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7436{
7437    return floatx80_compare_internal(a, b, 1, status);
7438}
7439
7440static inline int float128_compare_internal(float128 a, float128 b,
7441                                            int is_quiet, float_status *status)
7442{
7443    flag aSign, bSign;
7444
7445    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7446          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7447        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7448          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7449        if (!is_quiet ||
7450            float128_is_signaling_nan( a ) ||
7451            float128_is_signaling_nan( b ) ) {
7452            float_raise(float_flag_invalid, status);
7453        }
7454        return float_relation_unordered;
7455    }
7456    aSign = extractFloat128Sign( a );
7457    bSign = extractFloat128Sign( b );
7458    if ( aSign != bSign ) {
7459        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7460            /* zero case */
7461            return float_relation_equal;
7462        } else {
7463            return 1 - (2 * aSign);
7464        }
7465    } else {
7466        if (a.low == b.low && a.high == b.high) {
7467            return float_relation_equal;
7468        } else {
7469            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7470        }
7471    }
7472}
7473
7474int float128_compare(float128 a, float128 b, float_status *status)
7475{
7476    return float128_compare_internal(a, b, 0, status);
7477}
7478
7479int float128_compare_quiet(float128 a, float128 b, float_status *status)
7480{
7481    return float128_compare_internal(a, b, 1, status);
7482}
7483
7484/* min() and max() functions. These can't be implemented as
7485 * 'compare and pick one input' because that would mishandle
7486 * NaNs and +0 vs -0.
7487 *
7488 * minnum() and maxnum() functions. These are similar to the min()
7489 * and max() functions but if one of the arguments is a QNaN and
7490 * the other is numerical then the numerical argument is returned.
7491 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7492 * and maxNum() operations. min() and max() are the typical min/max
7493 * semantics provided by many CPUs which predate that specification.
7494 *
7495 * minnummag() and maxnummag() functions correspond to minNumMag()
7496 * and minNumMag() from the IEEE-754 2008.
7497 */
7498#define MINMAX(s)                                                       \
7499static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7500                                               int ismin, int isieee,   \
7501                                               int ismag,               \
7502                                               float_status *status)    \
7503{                                                                       \
7504    flag aSign, bSign;                                                  \
7505    uint ## s ## _t av, bv, aav, abv;                                   \
7506    a = float ## s ## _squash_input_denormal(a, status);                \
7507    b = float ## s ## _squash_input_denormal(b, status);                \
7508    if (float ## s ## _is_any_nan(a) ||                                 \
7509        float ## s ## _is_any_nan(b)) {                                 \
7510        if (isieee) {                                                   \
7511            if (float ## s ## _is_quiet_nan(a) &&                       \
7512                !float ## s ##_is_any_nan(b)) {                         \
7513                return b;                                               \
7514            } else if (float ## s ## _is_quiet_nan(b) &&                \
7515                       !float ## s ## _is_any_nan(a)) {                 \
7516                return a;                                               \
7517            }                                                           \
7518        }                                                               \
7519        return propagateFloat ## s ## NaN(a, b, status);                \
7520    }                                                                   \
7521    aSign = extractFloat ## s ## Sign(a);                               \
7522    bSign = extractFloat ## s ## Sign(b);                               \
7523    av = float ## s ## _val(a);                                         \
7524    bv = float ## s ## _val(b);                                         \
7525    if (ismag) {                                                        \
7526        aav = float ## s ## _abs(av);                                   \
7527        abv = float ## s ## _abs(bv);                                   \
7528        if (aav != abv) {                                               \
7529            if (ismin) {                                                \
7530                return (aav < abv) ? a : b;                             \
7531            } else {                                                    \
7532                return (aav < abv) ? b : a;                             \
7533            }                                                           \
7534        }                                                               \
7535    }                                                                   \
7536    if (aSign != bSign) {                                               \
7537        if (ismin) {                                                    \
7538            return aSign ? a : b;                                       \
7539        } else {                                                        \
7540            return aSign ? b : a;                                       \
7541        }                                                               \
7542    } else {                                                            \
7543        if (ismin) {                                                    \
7544            return (aSign ^ (av < bv)) ? a : b;                         \
7545        } else {                                                        \
7546            return (aSign ^ (av < bv)) ? b : a;                         \
7547        }                                                               \
7548    }                                                                   \
7549}                                                                       \
7550                                                                        \
7551float ## s float ## s ## _min(float ## s a, float ## s b,               \
7552                              float_status *status)                     \
7553{                                                                       \
7554    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7555}                                                                       \
7556                                                                        \
7557float ## s float ## s ## _max(float ## s a, float ## s b,               \
7558                              float_status *status)                     \
7559{                                                                       \
7560    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7561}                                                                       \
7562                                                                        \
7563float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7564                                 float_status *status)                  \
7565{                                                                       \
7566    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7567}                                                                       \
7568                                                                        \
7569float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7570                                 float_status *status)                  \
7571{                                                                       \
7572    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7573}                                                                       \
7574                                                                        \
7575float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7576                                    float_status *status)               \
7577{                                                                       \
7578    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7579}                                                                       \
7580                                                                        \
7581float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7582                                    float_status *status)               \
7583{                                                                       \
7584    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7585}
7586
7587MINMAX(32)
7588MINMAX(64)
7589
7590
7591/* Multiply A by 2 raised to the power N.  */
7592float32 float32_scalbn(float32 a, int n, float_status *status)
7593{
7594    flag aSign;
7595    int16_t aExp;
7596    uint32_t aSig;
7597
7598    a = float32_squash_input_denormal(a, status);
7599    aSig = extractFloat32Frac( a );
7600    aExp = extractFloat32Exp( a );
7601    aSign = extractFloat32Sign( a );
7602
7603    if ( aExp == 0xFF ) {
7604        if ( aSig ) {
7605            return propagateFloat32NaN(a, a, status);
7606        }
7607        return a;
7608    }
7609    if (aExp != 0) {
7610        aSig |= 0x00800000;
7611    } else if (aSig == 0) {
7612        return a;
7613    } else {
7614        aExp++;
7615    }
7616
7617    if (n > 0x200) {
7618        n = 0x200;
7619    } else if (n < -0x200) {
7620        n = -0x200;
7621    }
7622
7623    aExp += n - 1;
7624    aSig <<= 7;
7625    return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7626}
7627
7628float64 float64_scalbn(float64 a, int n, float_status *status)
7629{
7630    flag aSign;
7631    int16_t aExp;
7632    uint64_t aSig;
7633
7634    a = float64_squash_input_denormal(a, status);
7635    aSig = extractFloat64Frac( a );
7636    aExp = extractFloat64Exp( a );
7637    aSign = extractFloat64Sign( a );
7638
7639    if ( aExp == 0x7FF ) {
7640        if ( aSig ) {
7641            return propagateFloat64NaN(a, a, status);
7642        }
7643        return a;
7644    }
7645    if (aExp != 0) {
7646        aSig |= LIT64( 0x0010000000000000 );
7647    } else if (aSig == 0) {
7648        return a;
7649    } else {
7650        aExp++;
7651    }
7652
7653    if (n > 0x1000) {
7654        n = 0x1000;
7655    } else if (n < -0x1000) {
7656        n = -0x1000;
7657    }
7658
7659    aExp += n - 1;
7660    aSig <<= 10;
7661    return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7662}
7663
7664floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7665{
7666    flag aSign;
7667    int32_t aExp;
7668    uint64_t aSig;
7669
7670    aSig = extractFloatx80Frac( a );
7671    aExp = extractFloatx80Exp( a );
7672    aSign = extractFloatx80Sign( a );
7673
7674    if ( aExp == 0x7FFF ) {
7675        if ( aSig<<1 ) {
7676            return propagateFloatx80NaN(a, a, status);
7677        }
7678        return a;
7679    }
7680
7681    if (aExp == 0) {
7682        if (aSig == 0) {
7683            return a;
7684        }
7685        aExp++;
7686    }
7687
7688    if (n > 0x10000) {
7689        n = 0x10000;
7690    } else if (n < -0x10000) {
7691        n = -0x10000;
7692    }
7693
7694    aExp += n;
7695    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7696                                         aSign, aExp, aSig, 0, status);
7697}
7698
7699float128 float128_scalbn(float128 a, int n, float_status *status)
7700{
7701    flag aSign;
7702    int32_t aExp;
7703    uint64_t aSig0, aSig1;
7704
7705    aSig1 = extractFloat128Frac1( a );
7706    aSig0 = extractFloat128Frac0( a );
7707    aExp = extractFloat128Exp( a );
7708    aSign = extractFloat128Sign( a );
7709    if ( aExp == 0x7FFF ) {
7710        if ( aSig0 | aSig1 ) {
7711            return propagateFloat128NaN(a, a, status);
7712        }
7713        return a;
7714    }
7715    if (aExp != 0) {
7716        aSig0 |= LIT64( 0x0001000000000000 );
7717    } else if (aSig0 == 0 && aSig1 == 0) {
7718        return a;
7719    } else {
7720        aExp++;
7721    }
7722
7723    if (n > 0x10000) {
7724        n = 0x10000;
7725    } else if (n < -0x10000) {
7726        n = -0x10000;
7727    }
7728
7729    aExp += n - 1;
7730    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7731                                         , status);
7732
7733}
7734