qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90
  91/*----------------------------------------------------------------------------
  92| Primitive arithmetic functions, including multi-word arithmetic, and
  93| division and square root approximations.  (Can be specialized to target if
  94| desired.)
  95*----------------------------------------------------------------------------*/
  96#include "softfloat-macros.h"
  97
  98/*----------------------------------------------------------------------------
  99| Functions and definitions to determine:  (1) whether tininess for underflow
 100| is detected before or after rounding by default, (2) what (if anything)
 101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103| are propagated from function inputs to output.  These details are target-
 104| specific.
 105*----------------------------------------------------------------------------*/
 106#include "softfloat-specialize.h"
 107
 108/*----------------------------------------------------------------------------
 109| Returns the fraction bits of the half-precision floating-point value `a'.
 110*----------------------------------------------------------------------------*/
 111
 112static inline uint32_t extractFloat16Frac(float16 a)
 113{
 114    return float16_val(a) & 0x3ff;
 115}
 116
 117/*----------------------------------------------------------------------------
 118| Returns the exponent bits of the half-precision floating-point value `a'.
 119*----------------------------------------------------------------------------*/
 120
 121static inline int extractFloat16Exp(float16 a)
 122{
 123    return (float16_val(a) >> 10) & 0x1f;
 124}
 125
 126/*----------------------------------------------------------------------------
 127| Returns the sign bit of the single-precision floating-point value `a'.
 128*----------------------------------------------------------------------------*/
 129
 130static inline flag extractFloat16Sign(float16 a)
 131{
 132    return float16_val(a)>>15;
 133}
 134
 135/*----------------------------------------------------------------------------
 136| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 137| and 7, and returns the properly rounded 32-bit integer corresponding to the
 138| input.  If `zSign' is 1, the input is negated before being converted to an
 139| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 140| is simply rounded to an integer, with the inexact exception raised if the
 141| input cannot be represented exactly as an integer.  However, if the fixed-
 142| point input is too large, the invalid exception is raised and the largest
 143| positive or negative integer is returned.
 144*----------------------------------------------------------------------------*/
 145
 146static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 147{
 148    int8_t roundingMode;
 149    flag roundNearestEven;
 150    int8_t roundIncrement, roundBits;
 151    int32_t z;
 152
 153    roundingMode = status->float_rounding_mode;
 154    roundNearestEven = ( roundingMode == float_round_nearest_even );
 155    switch (roundingMode) {
 156    case float_round_nearest_even:
 157    case float_round_ties_away:
 158        roundIncrement = 0x40;
 159        break;
 160    case float_round_to_zero:
 161        roundIncrement = 0;
 162        break;
 163    case float_round_up:
 164        roundIncrement = zSign ? 0 : 0x7f;
 165        break;
 166    case float_round_down:
 167        roundIncrement = zSign ? 0x7f : 0;
 168        break;
 169    default:
 170        abort();
 171    }
 172    roundBits = absZ & 0x7F;
 173    absZ = ( absZ + roundIncrement )>>7;
 174    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 175    z = absZ;
 176    if ( zSign ) z = - z;
 177    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 178        float_raise(float_flag_invalid, status);
 179        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 180    }
 181    if (roundBits) {
 182        status->float_exception_flags |= float_flag_inexact;
 183    }
 184    return z;
 185
 186}
 187
 188/*----------------------------------------------------------------------------
 189| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 190| `absZ1', with binary point between bits 63 and 64 (between the input words),
 191| and returns the properly rounded 64-bit integer corresponding to the input.
 192| If `zSign' is 1, the input is negated before being converted to an integer.
 193| Ordinarily, the fixed-point input is simply rounded to an integer, with
 194| the inexact exception raised if the input cannot be represented exactly as
 195| an integer.  However, if the fixed-point input is too large, the invalid
 196| exception is raised and the largest positive or negative integer is
 197| returned.
 198*----------------------------------------------------------------------------*/
 199
 200static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 201                               float_status *status)
 202{
 203    int8_t roundingMode;
 204    flag roundNearestEven, increment;
 205    int64_t z;
 206
 207    roundingMode = status->float_rounding_mode;
 208    roundNearestEven = ( roundingMode == float_round_nearest_even );
 209    switch (roundingMode) {
 210    case float_round_nearest_even:
 211    case float_round_ties_away:
 212        increment = ((int64_t) absZ1 < 0);
 213        break;
 214    case float_round_to_zero:
 215        increment = 0;
 216        break;
 217    case float_round_up:
 218        increment = !zSign && absZ1;
 219        break;
 220    case float_round_down:
 221        increment = zSign && absZ1;
 222        break;
 223    default:
 224        abort();
 225    }
 226    if ( increment ) {
 227        ++absZ0;
 228        if ( absZ0 == 0 ) goto overflow;
 229        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 230    }
 231    z = absZ0;
 232    if ( zSign ) z = - z;
 233    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 234 overflow:
 235        float_raise(float_flag_invalid, status);
 236        return
 237              zSign ? (int64_t) LIT64( 0x8000000000000000 )
 238            : LIT64( 0x7FFFFFFFFFFFFFFF );
 239    }
 240    if (absZ1) {
 241        status->float_exception_flags |= float_flag_inexact;
 242    }
 243    return z;
 244
 245}
 246
 247/*----------------------------------------------------------------------------
 248| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 249| `absZ1', with binary point between bits 63 and 64 (between the input words),
 250| and returns the properly rounded 64-bit unsigned integer corresponding to the
 251| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 252| with the inexact exception raised if the input cannot be represented exactly
 253| as an integer.  However, if the fixed-point input is too large, the invalid
 254| exception is raised and the largest unsigned integer is returned.
 255*----------------------------------------------------------------------------*/
 256
 257static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 258                                uint64_t absZ1, float_status *status)
 259{
 260    int8_t roundingMode;
 261    flag roundNearestEven, increment;
 262
 263    roundingMode = status->float_rounding_mode;
 264    roundNearestEven = (roundingMode == float_round_nearest_even);
 265    switch (roundingMode) {
 266    case float_round_nearest_even:
 267    case float_round_ties_away:
 268        increment = ((int64_t)absZ1 < 0);
 269        break;
 270    case float_round_to_zero:
 271        increment = 0;
 272        break;
 273    case float_round_up:
 274        increment = !zSign && absZ1;
 275        break;
 276    case float_round_down:
 277        increment = zSign && absZ1;
 278        break;
 279    default:
 280        abort();
 281    }
 282    if (increment) {
 283        ++absZ0;
 284        if (absZ0 == 0) {
 285            float_raise(float_flag_invalid, status);
 286            return LIT64(0xFFFFFFFFFFFFFFFF);
 287        }
 288        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 289    }
 290
 291    if (zSign && absZ0) {
 292        float_raise(float_flag_invalid, status);
 293        return 0;
 294    }
 295
 296    if (absZ1) {
 297        status->float_exception_flags |= float_flag_inexact;
 298    }
 299    return absZ0;
 300}
 301
 302/*----------------------------------------------------------------------------
 303| Returns the fraction bits of the single-precision floating-point value `a'.
 304*----------------------------------------------------------------------------*/
 305
 306static inline uint32_t extractFloat32Frac( float32 a )
 307{
 308
 309    return float32_val(a) & 0x007FFFFF;
 310
 311}
 312
 313/*----------------------------------------------------------------------------
 314| Returns the exponent bits of the single-precision floating-point value `a'.
 315*----------------------------------------------------------------------------*/
 316
 317static inline int extractFloat32Exp(float32 a)
 318{
 319
 320    return ( float32_val(a)>>23 ) & 0xFF;
 321
 322}
 323
 324/*----------------------------------------------------------------------------
 325| Returns the sign bit of the single-precision floating-point value `a'.
 326*----------------------------------------------------------------------------*/
 327
 328static inline flag extractFloat32Sign( float32 a )
 329{
 330
 331    return float32_val(a)>>31;
 332
 333}
 334
 335/*----------------------------------------------------------------------------
 336| If `a' is denormal and we are in flush-to-zero mode then set the
 337| input-denormal exception and return zero. Otherwise just return the value.
 338*----------------------------------------------------------------------------*/
 339float32 float32_squash_input_denormal(float32 a, float_status *status)
 340{
 341    if (status->flush_inputs_to_zero) {
 342        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 343            float_raise(float_flag_input_denormal, status);
 344            return make_float32(float32_val(a) & 0x80000000);
 345        }
 346    }
 347    return a;
 348}
 349
 350/*----------------------------------------------------------------------------
 351| Normalizes the subnormal single-precision floating-point value represented
 352| by the denormalized significand `aSig'.  The normalized exponent and
 353| significand are stored at the locations pointed to by `zExpPtr' and
 354| `zSigPtr', respectively.
 355*----------------------------------------------------------------------------*/
 356
 357static void
 358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
 359{
 360    int8_t shiftCount;
 361
 362    shiftCount = countLeadingZeros32( aSig ) - 8;
 363    *zSigPtr = aSig<<shiftCount;
 364    *zExpPtr = 1 - shiftCount;
 365
 366}
 367
 368/*----------------------------------------------------------------------------
 369| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 370| single-precision floating-point value, returning the result.  After being
 371| shifted into the proper positions, the three fields are simply added
 372| together to form the result.  This means that any integer portion of `zSig'
 373| will be added into the exponent.  Since a properly normalized significand
 374| will have an integer portion equal to 1, the `zExp' input should be 1 less
 375| than the desired result exponent whenever `zSig' is a complete, normalized
 376| significand.
 377*----------------------------------------------------------------------------*/
 378
 379static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
 380{
 381
 382    return make_float32(
 383          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 384
 385}
 386
 387/*----------------------------------------------------------------------------
 388| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 389| and significand `zSig', and returns the proper single-precision floating-
 390| point value corresponding to the abstract input.  Ordinarily, the abstract
 391| value is simply rounded and packed into the single-precision format, with
 392| the inexact exception raised if the abstract input cannot be represented
 393| exactly.  However, if the abstract value is too large, the overflow and
 394| inexact exceptions are raised and an infinity or maximal finite value is
 395| returned.  If the abstract value is too small, the input value is rounded to
 396| a subnormal number, and the underflow and inexact exceptions are raised if
 397| the abstract input cannot be represented exactly as a subnormal single-
 398| precision floating-point number.
 399|     The input significand `zSig' has its binary point between bits 30
 400| and 29, which is 7 bits to the left of the usual location.  This shifted
 401| significand must be normalized or smaller.  If `zSig' is not normalized,
 402| `zExp' must be 0; in that case, the result returned is a subnormal number,
 403| and it must not require rounding.  In the usual case that `zSig' is
 404| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 405| The handling of underflow and overflow follows the IEC/IEEE Standard for
 406| Binary Floating-Point Arithmetic.
 407*----------------------------------------------------------------------------*/
 408
 409static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 410                                   float_status *status)
 411{
 412    int8_t roundingMode;
 413    flag roundNearestEven;
 414    int8_t roundIncrement, roundBits;
 415    flag isTiny;
 416
 417    roundingMode = status->float_rounding_mode;
 418    roundNearestEven = ( roundingMode == float_round_nearest_even );
 419    switch (roundingMode) {
 420    case float_round_nearest_even:
 421    case float_round_ties_away:
 422        roundIncrement = 0x40;
 423        break;
 424    case float_round_to_zero:
 425        roundIncrement = 0;
 426        break;
 427    case float_round_up:
 428        roundIncrement = zSign ? 0 : 0x7f;
 429        break;
 430    case float_round_down:
 431        roundIncrement = zSign ? 0x7f : 0;
 432        break;
 433    default:
 434        abort();
 435        break;
 436    }
 437    roundBits = zSig & 0x7F;
 438    if ( 0xFD <= (uint16_t) zExp ) {
 439        if (    ( 0xFD < zExp )
 440             || (    ( zExp == 0xFD )
 441                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 442           ) {
 443            float_raise(float_flag_overflow | float_flag_inexact, status);
 444            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 445        }
 446        if ( zExp < 0 ) {
 447            if (status->flush_to_zero) {
 448                float_raise(float_flag_output_denormal, status);
 449                return packFloat32(zSign, 0, 0);
 450            }
 451            isTiny =
 452                (status->float_detect_tininess
 453                 == float_tininess_before_rounding)
 454                || ( zExp < -1 )
 455                || ( zSig + roundIncrement < 0x80000000 );
 456            shift32RightJamming( zSig, - zExp, &zSig );
 457            zExp = 0;
 458            roundBits = zSig & 0x7F;
 459            if (isTiny && roundBits) {
 460                float_raise(float_flag_underflow, status);
 461            }
 462        }
 463    }
 464    if (roundBits) {
 465        status->float_exception_flags |= float_flag_inexact;
 466    }
 467    zSig = ( zSig + roundIncrement )>>7;
 468    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 469    if ( zSig == 0 ) zExp = 0;
 470    return packFloat32( zSign, zExp, zSig );
 471
 472}
 473
 474/*----------------------------------------------------------------------------
 475| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 476| and significand `zSig', and returns the proper single-precision floating-
 477| point value corresponding to the abstract input.  This routine is just like
 478| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 479| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 480| floating-point exponent.
 481*----------------------------------------------------------------------------*/
 482
 483static float32
 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 485                              float_status *status)
 486{
 487    int8_t shiftCount;
 488
 489    shiftCount = countLeadingZeros32( zSig ) - 1;
 490    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 491                               status);
 492
 493}
 494
 495/*----------------------------------------------------------------------------
 496| Returns the fraction bits of the double-precision floating-point value `a'.
 497*----------------------------------------------------------------------------*/
 498
 499static inline uint64_t extractFloat64Frac( float64 a )
 500{
 501
 502    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 503
 504}
 505
 506/*----------------------------------------------------------------------------
 507| Returns the exponent bits of the double-precision floating-point value `a'.
 508*----------------------------------------------------------------------------*/
 509
 510static inline int extractFloat64Exp(float64 a)
 511{
 512
 513    return ( float64_val(a)>>52 ) & 0x7FF;
 514
 515}
 516
 517/*----------------------------------------------------------------------------
 518| Returns the sign bit of the double-precision floating-point value `a'.
 519*----------------------------------------------------------------------------*/
 520
 521static inline flag extractFloat64Sign( float64 a )
 522{
 523
 524    return float64_val(a)>>63;
 525
 526}
 527
 528/*----------------------------------------------------------------------------
 529| If `a' is denormal and we are in flush-to-zero mode then set the
 530| input-denormal exception and return zero. Otherwise just return the value.
 531*----------------------------------------------------------------------------*/
 532float64 float64_squash_input_denormal(float64 a, float_status *status)
 533{
 534    if (status->flush_inputs_to_zero) {
 535        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 536            float_raise(float_flag_input_denormal, status);
 537            return make_float64(float64_val(a) & (1ULL << 63));
 538        }
 539    }
 540    return a;
 541}
 542
 543/*----------------------------------------------------------------------------
 544| Normalizes the subnormal double-precision floating-point value represented
 545| by the denormalized significand `aSig'.  The normalized exponent and
 546| significand are stored at the locations pointed to by `zExpPtr' and
 547| `zSigPtr', respectively.
 548*----------------------------------------------------------------------------*/
 549
 550static void
 551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
 552{
 553    int8_t shiftCount;
 554
 555    shiftCount = countLeadingZeros64( aSig ) - 11;
 556    *zSigPtr = aSig<<shiftCount;
 557    *zExpPtr = 1 - shiftCount;
 558
 559}
 560
 561/*----------------------------------------------------------------------------
 562| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 563| double-precision floating-point value, returning the result.  After being
 564| shifted into the proper positions, the three fields are simply added
 565| together to form the result.  This means that any integer portion of `zSig'
 566| will be added into the exponent.  Since a properly normalized significand
 567| will have an integer portion equal to 1, the `zExp' input should be 1 less
 568| than the desired result exponent whenever `zSig' is a complete, normalized
 569| significand.
 570*----------------------------------------------------------------------------*/
 571
 572static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
 573{
 574
 575    return make_float64(
 576        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 577
 578}
 579
 580/*----------------------------------------------------------------------------
 581| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 582| and significand `zSig', and returns the proper double-precision floating-
 583| point value corresponding to the abstract input.  Ordinarily, the abstract
 584| value is simply rounded and packed into the double-precision format, with
 585| the inexact exception raised if the abstract input cannot be represented
 586| exactly.  However, if the abstract value is too large, the overflow and
 587| inexact exceptions are raised and an infinity or maximal finite value is
 588| returned.  If the abstract value is too small, the input value is rounded to
 589| a subnormal number, and the underflow and inexact exceptions are raised if
 590| the abstract input cannot be represented exactly as a subnormal double-
 591| precision floating-point number.
 592|     The input significand `zSig' has its binary point between bits 62
 593| and 61, which is 10 bits to the left of the usual location.  This shifted
 594| significand must be normalized or smaller.  If `zSig' is not normalized,
 595| `zExp' must be 0; in that case, the result returned is a subnormal number,
 596| and it must not require rounding.  In the usual case that `zSig' is
 597| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 598| The handling of underflow and overflow follows the IEC/IEEE Standard for
 599| Binary Floating-Point Arithmetic.
 600*----------------------------------------------------------------------------*/
 601
 602static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 603                                   float_status *status)
 604{
 605    int8_t roundingMode;
 606    flag roundNearestEven;
 607    int roundIncrement, roundBits;
 608    flag isTiny;
 609
 610    roundingMode = status->float_rounding_mode;
 611    roundNearestEven = ( roundingMode == float_round_nearest_even );
 612    switch (roundingMode) {
 613    case float_round_nearest_even:
 614    case float_round_ties_away:
 615        roundIncrement = 0x200;
 616        break;
 617    case float_round_to_zero:
 618        roundIncrement = 0;
 619        break;
 620    case float_round_up:
 621        roundIncrement = zSign ? 0 : 0x3ff;
 622        break;
 623    case float_round_down:
 624        roundIncrement = zSign ? 0x3ff : 0;
 625        break;
 626    case float_round_to_odd:
 627        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
 628        break;
 629    default:
 630        abort();
 631    }
 632    roundBits = zSig & 0x3FF;
 633    if ( 0x7FD <= (uint16_t) zExp ) {
 634        if (    ( 0x7FD < zExp )
 635             || (    ( zExp == 0x7FD )
 636                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 637           ) {
 638            bool overflow_to_inf = roundingMode != float_round_to_odd &&
 639                                   roundIncrement != 0;
 640            float_raise(float_flag_overflow | float_flag_inexact, status);
 641            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
 642        }
 643        if ( zExp < 0 ) {
 644            if (status->flush_to_zero) {
 645                float_raise(float_flag_output_denormal, status);
 646                return packFloat64(zSign, 0, 0);
 647            }
 648            isTiny =
 649                   (status->float_detect_tininess
 650                    == float_tininess_before_rounding)
 651                || ( zExp < -1 )
 652                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 653            shift64RightJamming( zSig, - zExp, &zSig );
 654            zExp = 0;
 655            roundBits = zSig & 0x3FF;
 656            if (isTiny && roundBits) {
 657                float_raise(float_flag_underflow, status);
 658            }
 659            if (roundingMode == float_round_to_odd) {
 660                /*
 661                 * For round-to-odd case, the roundIncrement depends on
 662                 * zSig which just changed.
 663                 */
 664                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
 665            }
 666        }
 667    }
 668    if (roundBits) {
 669        status->float_exception_flags |= float_flag_inexact;
 670    }
 671    zSig = ( zSig + roundIncrement )>>10;
 672    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 673    if ( zSig == 0 ) zExp = 0;
 674    return packFloat64( zSign, zExp, zSig );
 675
 676}
 677
 678/*----------------------------------------------------------------------------
 679| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 680| and significand `zSig', and returns the proper double-precision floating-
 681| point value corresponding to the abstract input.  This routine is just like
 682| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 683| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 684| floating-point exponent.
 685*----------------------------------------------------------------------------*/
 686
 687static float64
 688 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 689                              float_status *status)
 690{
 691    int8_t shiftCount;
 692
 693    shiftCount = countLeadingZeros64( zSig ) - 1;
 694    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 695                               status);
 696
 697}
 698
 699/*----------------------------------------------------------------------------
 700| Returns the fraction bits of the extended double-precision floating-point
 701| value `a'.
 702*----------------------------------------------------------------------------*/
 703
 704static inline uint64_t extractFloatx80Frac( floatx80 a )
 705{
 706
 707    return a.low;
 708
 709}
 710
 711/*----------------------------------------------------------------------------
 712| Returns the exponent bits of the extended double-precision floating-point
 713| value `a'.
 714*----------------------------------------------------------------------------*/
 715
 716static inline int32_t extractFloatx80Exp( floatx80 a )
 717{
 718
 719    return a.high & 0x7FFF;
 720
 721}
 722
 723/*----------------------------------------------------------------------------
 724| Returns the sign bit of the extended double-precision floating-point value
 725| `a'.
 726*----------------------------------------------------------------------------*/
 727
 728static inline flag extractFloatx80Sign( floatx80 a )
 729{
 730
 731    return a.high>>15;
 732
 733}
 734
 735/*----------------------------------------------------------------------------
 736| Normalizes the subnormal extended double-precision floating-point value
 737| represented by the denormalized significand `aSig'.  The normalized exponent
 738| and significand are stored at the locations pointed to by `zExpPtr' and
 739| `zSigPtr', respectively.
 740*----------------------------------------------------------------------------*/
 741
 742static void
 743 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
 744{
 745    int8_t shiftCount;
 746
 747    shiftCount = countLeadingZeros64( aSig );
 748    *zSigPtr = aSig<<shiftCount;
 749    *zExpPtr = 1 - shiftCount;
 750
 751}
 752
 753/*----------------------------------------------------------------------------
 754| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 755| extended double-precision floating-point value, returning the result.
 756*----------------------------------------------------------------------------*/
 757
 758static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
 759{
 760    floatx80 z;
 761
 762    z.low = zSig;
 763    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 764    return z;
 765
 766}
 767
 768/*----------------------------------------------------------------------------
 769| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 770| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 771| and returns the proper extended double-precision floating-point value
 772| corresponding to the abstract input.  Ordinarily, the abstract value is
 773| rounded and packed into the extended double-precision format, with the
 774| inexact exception raised if the abstract input cannot be represented
 775| exactly.  However, if the abstract value is too large, the overflow and
 776| inexact exceptions are raised and an infinity or maximal finite value is
 777| returned.  If the abstract value is too small, the input value is rounded to
 778| a subnormal number, and the underflow and inexact exceptions are raised if
 779| the abstract input cannot be represented exactly as a subnormal extended
 780| double-precision floating-point number.
 781|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 782| number of bits as single or double precision, respectively.  Otherwise, the
 783| result is rounded to the full precision of the extended double-precision
 784| format.
 785|     The input significand must be normalized or smaller.  If the input
 786| significand is not normalized, `zExp' must be 0; in that case, the result
 787| returned is a subnormal number, and it must not require rounding.  The
 788| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 789| Floating-Point Arithmetic.
 790*----------------------------------------------------------------------------*/
 791
 792static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
 793                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
 794                                     float_status *status)
 795{
 796    int8_t roundingMode;
 797    flag roundNearestEven, increment, isTiny;
 798    int64_t roundIncrement, roundMask, roundBits;
 799
 800    roundingMode = status->float_rounding_mode;
 801    roundNearestEven = ( roundingMode == float_round_nearest_even );
 802    if ( roundingPrecision == 80 ) goto precision80;
 803    if ( roundingPrecision == 64 ) {
 804        roundIncrement = LIT64( 0x0000000000000400 );
 805        roundMask = LIT64( 0x00000000000007FF );
 806    }
 807    else if ( roundingPrecision == 32 ) {
 808        roundIncrement = LIT64( 0x0000008000000000 );
 809        roundMask = LIT64( 0x000000FFFFFFFFFF );
 810    }
 811    else {
 812        goto precision80;
 813    }
 814    zSig0 |= ( zSig1 != 0 );
 815    switch (roundingMode) {
 816    case float_round_nearest_even:
 817    case float_round_ties_away:
 818        break;
 819    case float_round_to_zero:
 820        roundIncrement = 0;
 821        break;
 822    case float_round_up:
 823        roundIncrement = zSign ? 0 : roundMask;
 824        break;
 825    case float_round_down:
 826        roundIncrement = zSign ? roundMask : 0;
 827        break;
 828    default:
 829        abort();
 830    }
 831    roundBits = zSig0 & roundMask;
 832    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 833        if (    ( 0x7FFE < zExp )
 834             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 835           ) {
 836            goto overflow;
 837        }
 838        if ( zExp <= 0 ) {
 839            if (status->flush_to_zero) {
 840                float_raise(float_flag_output_denormal, status);
 841                return packFloatx80(zSign, 0, 0);
 842            }
 843            isTiny =
 844                   (status->float_detect_tininess
 845                    == float_tininess_before_rounding)
 846                || ( zExp < 0 )
 847                || ( zSig0 <= zSig0 + roundIncrement );
 848            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 849            zExp = 0;
 850            roundBits = zSig0 & roundMask;
 851            if (isTiny && roundBits) {
 852                float_raise(float_flag_underflow, status);
 853            }
 854            if (roundBits) {
 855                status->float_exception_flags |= float_flag_inexact;
 856            }
 857            zSig0 += roundIncrement;
 858            if ( (int64_t) zSig0 < 0 ) zExp = 1;
 859            roundIncrement = roundMask + 1;
 860            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 861                roundMask |= roundIncrement;
 862            }
 863            zSig0 &= ~ roundMask;
 864            return packFloatx80( zSign, zExp, zSig0 );
 865        }
 866    }
 867    if (roundBits) {
 868        status->float_exception_flags |= float_flag_inexact;
 869    }
 870    zSig0 += roundIncrement;
 871    if ( zSig0 < roundIncrement ) {
 872        ++zExp;
 873        zSig0 = LIT64( 0x8000000000000000 );
 874    }
 875    roundIncrement = roundMask + 1;
 876    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 877        roundMask |= roundIncrement;
 878    }
 879    zSig0 &= ~ roundMask;
 880    if ( zSig0 == 0 ) zExp = 0;
 881    return packFloatx80( zSign, zExp, zSig0 );
 882 precision80:
 883    switch (roundingMode) {
 884    case float_round_nearest_even:
 885    case float_round_ties_away:
 886        increment = ((int64_t)zSig1 < 0);
 887        break;
 888    case float_round_to_zero:
 889        increment = 0;
 890        break;
 891    case float_round_up:
 892        increment = !zSign && zSig1;
 893        break;
 894    case float_round_down:
 895        increment = zSign && zSig1;
 896        break;
 897    default:
 898        abort();
 899    }
 900    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 901        if (    ( 0x7FFE < zExp )
 902             || (    ( zExp == 0x7FFE )
 903                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 904                  && increment
 905                )
 906           ) {
 907            roundMask = 0;
 908 overflow:
 909            float_raise(float_flag_overflow | float_flag_inexact, status);
 910            if (    ( roundingMode == float_round_to_zero )
 911                 || ( zSign && ( roundingMode == float_round_up ) )
 912                 || ( ! zSign && ( roundingMode == float_round_down ) )
 913               ) {
 914                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 915            }
 916            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 917        }
 918        if ( zExp <= 0 ) {
 919            isTiny =
 920                   (status->float_detect_tininess
 921                    == float_tininess_before_rounding)
 922                || ( zExp < 0 )
 923                || ! increment
 924                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 925            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 926            zExp = 0;
 927            if (isTiny && zSig1) {
 928                float_raise(float_flag_underflow, status);
 929            }
 930            if (zSig1) {
 931                status->float_exception_flags |= float_flag_inexact;
 932            }
 933            switch (roundingMode) {
 934            case float_round_nearest_even:
 935            case float_round_ties_away:
 936                increment = ((int64_t)zSig1 < 0);
 937                break;
 938            case float_round_to_zero:
 939                increment = 0;
 940                break;
 941            case float_round_up:
 942                increment = !zSign && zSig1;
 943                break;
 944            case float_round_down:
 945                increment = zSign && zSig1;
 946                break;
 947            default:
 948                abort();
 949            }
 950            if ( increment ) {
 951                ++zSig0;
 952                zSig0 &=
 953                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 954                if ( (int64_t) zSig0 < 0 ) zExp = 1;
 955            }
 956            return packFloatx80( zSign, zExp, zSig0 );
 957        }
 958    }
 959    if (zSig1) {
 960        status->float_exception_flags |= float_flag_inexact;
 961    }
 962    if ( increment ) {
 963        ++zSig0;
 964        if ( zSig0 == 0 ) {
 965            ++zExp;
 966            zSig0 = LIT64( 0x8000000000000000 );
 967        }
 968        else {
 969            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 970        }
 971    }
 972    else {
 973        if ( zSig0 == 0 ) zExp = 0;
 974    }
 975    return packFloatx80( zSign, zExp, zSig0 );
 976
 977}
 978
 979/*----------------------------------------------------------------------------
 980| Takes an abstract floating-point value having sign `zSign', exponent
 981| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 982| and returns the proper extended double-precision floating-point value
 983| corresponding to the abstract input.  This routine is just like
 984| `roundAndPackFloatx80' except that the input significand does not have to be
 985| normalized.
 986*----------------------------------------------------------------------------*/
 987
 988static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
 989                                              flag zSign, int32_t zExp,
 990                                              uint64_t zSig0, uint64_t zSig1,
 991                                              float_status *status)
 992{
 993    int8_t shiftCount;
 994
 995    if ( zSig0 == 0 ) {
 996        zSig0 = zSig1;
 997        zSig1 = 0;
 998        zExp -= 64;
 999    }
1000    shiftCount = countLeadingZeros64( zSig0 );
1001    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002    zExp -= shiftCount;
1003    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004                                zSig0, zSig1, status);
1005
1006}
1007
1008/*----------------------------------------------------------------------------
1009| Returns the least-significant 64 fraction bits of the quadruple-precision
1010| floating-point value `a'.
1011*----------------------------------------------------------------------------*/
1012
1013static inline uint64_t extractFloat128Frac1( float128 a )
1014{
1015
1016    return a.low;
1017
1018}
1019
1020/*----------------------------------------------------------------------------
1021| Returns the most-significant 48 fraction bits of the quadruple-precision
1022| floating-point value `a'.
1023*----------------------------------------------------------------------------*/
1024
1025static inline uint64_t extractFloat128Frac0( float128 a )
1026{
1027
1028    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029
1030}
1031
1032/*----------------------------------------------------------------------------
1033| Returns the exponent bits of the quadruple-precision floating-point value
1034| `a'.
1035*----------------------------------------------------------------------------*/
1036
1037static inline int32_t extractFloat128Exp( float128 a )
1038{
1039
1040    return ( a.high>>48 ) & 0x7FFF;
1041
1042}
1043
1044/*----------------------------------------------------------------------------
1045| Returns the sign bit of the quadruple-precision floating-point value `a'.
1046*----------------------------------------------------------------------------*/
1047
1048static inline flag extractFloat128Sign( float128 a )
1049{
1050
1051    return a.high>>63;
1052
1053}
1054
1055/*----------------------------------------------------------------------------
1056| Normalizes the subnormal quadruple-precision floating-point value
1057| represented by the denormalized significand formed by the concatenation of
1058| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1059| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1060| significand are stored at the location pointed to by `zSig0Ptr', and the
1061| least significant 64 bits of the normalized significand are stored at the
1062| location pointed to by `zSig1Ptr'.
1063*----------------------------------------------------------------------------*/
1064
1065static void
1066 normalizeFloat128Subnormal(
1067     uint64_t aSig0,
1068     uint64_t aSig1,
1069     int32_t *zExpPtr,
1070     uint64_t *zSig0Ptr,
1071     uint64_t *zSig1Ptr
1072 )
1073{
1074    int8_t shiftCount;
1075
1076    if ( aSig0 == 0 ) {
1077        shiftCount = countLeadingZeros64( aSig1 ) - 15;
1078        if ( shiftCount < 0 ) {
1079            *zSig0Ptr = aSig1>>( - shiftCount );
1080            *zSig1Ptr = aSig1<<( shiftCount & 63 );
1081        }
1082        else {
1083            *zSig0Ptr = aSig1<<shiftCount;
1084            *zSig1Ptr = 0;
1085        }
1086        *zExpPtr = - shiftCount - 63;
1087    }
1088    else {
1089        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1090        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1091        *zExpPtr = 1 - shiftCount;
1092    }
1093
1094}
1095
1096/*----------------------------------------------------------------------------
1097| Packs the sign `zSign', the exponent `zExp', and the significand formed
1098| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099| floating-point value, returning the result.  After being shifted into the
1100| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101| added together to form the most significant 32 bits of the result.  This
1102| means that any integer portion of `zSig0' will be added into the exponent.
1103| Since a properly normalized significand will have an integer portion equal
1104| to 1, the `zExp' input should be 1 less than the desired result exponent
1105| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106| significand.
1107*----------------------------------------------------------------------------*/
1108
1109static inline float128
1110 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1111{
1112    float128 z;
1113
1114    z.low = zSig1;
1115    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1116    return z;
1117
1118}
1119
1120/*----------------------------------------------------------------------------
1121| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122| and extended significand formed by the concatenation of `zSig0', `zSig1',
1123| and `zSig2', and returns the proper quadruple-precision floating-point value
1124| corresponding to the abstract input.  Ordinarily, the abstract value is
1125| simply rounded and packed into the quadruple-precision format, with the
1126| inexact exception raised if the abstract input cannot be represented
1127| exactly.  However, if the abstract value is too large, the overflow and
1128| inexact exceptions are raised and an infinity or maximal finite value is
1129| returned.  If the abstract value is too small, the input value is rounded to
1130| a subnormal number, and the underflow and inexact exceptions are raised if
1131| the abstract input cannot be represented exactly as a subnormal quadruple-
1132| precision floating-point number.
1133|     The input significand must be normalized or smaller.  If the input
1134| significand is not normalized, `zExp' must be 0; in that case, the result
1135| returned is a subnormal number, and it must not require rounding.  In the
1136| usual case that the input significand is normalized, `zExp' must be 1 less
1137| than the ``true'' floating-point exponent.  The handling of underflow and
1138| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1142                                     uint64_t zSig0, uint64_t zSig1,
1143                                     uint64_t zSig2, float_status *status)
1144{
1145    int8_t roundingMode;
1146    flag roundNearestEven, increment, isTiny;
1147
1148    roundingMode = status->float_rounding_mode;
1149    roundNearestEven = ( roundingMode == float_round_nearest_even );
1150    switch (roundingMode) {
1151    case float_round_nearest_even:
1152    case float_round_ties_away:
1153        increment = ((int64_t)zSig2 < 0);
1154        break;
1155    case float_round_to_zero:
1156        increment = 0;
1157        break;
1158    case float_round_up:
1159        increment = !zSign && zSig2;
1160        break;
1161    case float_round_down:
1162        increment = zSign && zSig2;
1163        break;
1164    case float_round_to_odd:
1165        increment = !(zSig1 & 0x1) && zSig2;
1166        break;
1167    default:
1168        abort();
1169    }
1170    if ( 0x7FFD <= (uint32_t) zExp ) {
1171        if (    ( 0x7FFD < zExp )
1172             || (    ( zExp == 0x7FFD )
1173                  && eq128(
1174                         LIT64( 0x0001FFFFFFFFFFFF ),
1175                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1176                         zSig0,
1177                         zSig1
1178                     )
1179                  && increment
1180                )
1181           ) {
1182            float_raise(float_flag_overflow | float_flag_inexact, status);
1183            if (    ( roundingMode == float_round_to_zero )
1184                 || ( zSign && ( roundingMode == float_round_up ) )
1185                 || ( ! zSign && ( roundingMode == float_round_down ) )
1186                 || (roundingMode == float_round_to_odd)
1187               ) {
1188                return
1189                    packFloat128(
1190                        zSign,
1191                        0x7FFE,
1192                        LIT64( 0x0000FFFFFFFFFFFF ),
1193                        LIT64( 0xFFFFFFFFFFFFFFFF )
1194                    );
1195            }
1196            return packFloat128( zSign, 0x7FFF, 0, 0 );
1197        }
1198        if ( zExp < 0 ) {
1199            if (status->flush_to_zero) {
1200                float_raise(float_flag_output_denormal, status);
1201                return packFloat128(zSign, 0, 0, 0);
1202            }
1203            isTiny =
1204                   (status->float_detect_tininess
1205                    == float_tininess_before_rounding)
1206                || ( zExp < -1 )
1207                || ! increment
1208                || lt128(
1209                       zSig0,
1210                       zSig1,
1211                       LIT64( 0x0001FFFFFFFFFFFF ),
1212                       LIT64( 0xFFFFFFFFFFFFFFFF )
1213                   );
1214            shift128ExtraRightJamming(
1215                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1216            zExp = 0;
1217            if (isTiny && zSig2) {
1218                float_raise(float_flag_underflow, status);
1219            }
1220            switch (roundingMode) {
1221            case float_round_nearest_even:
1222            case float_round_ties_away:
1223                increment = ((int64_t)zSig2 < 0);
1224                break;
1225            case float_round_to_zero:
1226                increment = 0;
1227                break;
1228            case float_round_up:
1229                increment = !zSign && zSig2;
1230                break;
1231            case float_round_down:
1232                increment = zSign && zSig2;
1233                break;
1234            case float_round_to_odd:
1235                increment = !(zSig1 & 0x1) && zSig2;
1236                break;
1237            default:
1238                abort();
1239            }
1240        }
1241    }
1242    if (zSig2) {
1243        status->float_exception_flags |= float_flag_inexact;
1244    }
1245    if ( increment ) {
1246        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1247        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1248    }
1249    else {
1250        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1251    }
1252    return packFloat128( zSign, zExp, zSig0, zSig1 );
1253
1254}
1255
1256/*----------------------------------------------------------------------------
1257| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258| and significand formed by the concatenation of `zSig0' and `zSig1', and
1259| returns the proper quadruple-precision floating-point value corresponding
1260| to the abstract input.  This routine is just like `roundAndPackFloat128'
1261| except that the input significand has fewer bits and does not have to be
1262| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1263| point exponent.
1264*----------------------------------------------------------------------------*/
1265
1266static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1267                                              uint64_t zSig0, uint64_t zSig1,
1268                                              float_status *status)
1269{
1270    int8_t shiftCount;
1271    uint64_t zSig2;
1272
1273    if ( zSig0 == 0 ) {
1274        zSig0 = zSig1;
1275        zSig1 = 0;
1276        zExp -= 64;
1277    }
1278    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279    if ( 0 <= shiftCount ) {
1280        zSig2 = 0;
1281        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282    }
1283    else {
1284        shift128ExtraRightJamming(
1285            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286    }
1287    zExp -= shiftCount;
1288    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1289
1290}
1291
1292/*----------------------------------------------------------------------------
1293| Returns the result of converting the 32-bit two's complement integer `a'
1294| to the single-precision floating-point format.  The conversion is performed
1295| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296*----------------------------------------------------------------------------*/
1297
1298float32 int32_to_float32(int32_t a, float_status *status)
1299{
1300    flag zSign;
1301
1302    if ( a == 0 ) return float32_zero;
1303    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1304    zSign = ( a < 0 );
1305    return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1306}
1307
1308/*----------------------------------------------------------------------------
1309| Returns the result of converting the 32-bit two's complement integer `a'
1310| to the double-precision floating-point format.  The conversion is performed
1311| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312*----------------------------------------------------------------------------*/
1313
1314float64 int32_to_float64(int32_t a, float_status *status)
1315{
1316    flag zSign;
1317    uint32_t absA;
1318    int8_t shiftCount;
1319    uint64_t zSig;
1320
1321    if ( a == 0 ) return float64_zero;
1322    zSign = ( a < 0 );
1323    absA = zSign ? - a : a;
1324    shiftCount = countLeadingZeros32( absA ) + 21;
1325    zSig = absA;
1326    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327
1328}
1329
1330/*----------------------------------------------------------------------------
1331| Returns the result of converting the 32-bit two's complement integer `a'
1332| to the extended double-precision floating-point format.  The conversion
1333| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334| Arithmetic.
1335*----------------------------------------------------------------------------*/
1336
1337floatx80 int32_to_floatx80(int32_t a, float_status *status)
1338{
1339    flag zSign;
1340    uint32_t absA;
1341    int8_t shiftCount;
1342    uint64_t zSig;
1343
1344    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345    zSign = ( a < 0 );
1346    absA = zSign ? - a : a;
1347    shiftCount = countLeadingZeros32( absA ) + 32;
1348    zSig = absA;
1349    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350
1351}
1352
1353/*----------------------------------------------------------------------------
1354| Returns the result of converting the 32-bit two's complement integer `a' to
1355| the quadruple-precision floating-point format.  The conversion is performed
1356| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357*----------------------------------------------------------------------------*/
1358
1359float128 int32_to_float128(int32_t a, float_status *status)
1360{
1361    flag zSign;
1362    uint32_t absA;
1363    int8_t shiftCount;
1364    uint64_t zSig0;
1365
1366    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367    zSign = ( a < 0 );
1368    absA = zSign ? - a : a;
1369    shiftCount = countLeadingZeros32( absA ) + 17;
1370    zSig0 = absA;
1371    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372
1373}
1374
1375/*----------------------------------------------------------------------------
1376| Returns the result of converting the 64-bit two's complement integer `a'
1377| to the single-precision floating-point format.  The conversion is performed
1378| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379*----------------------------------------------------------------------------*/
1380
1381float32 int64_to_float32(int64_t a, float_status *status)
1382{
1383    flag zSign;
1384    uint64_t absA;
1385    int8_t shiftCount;
1386
1387    if ( a == 0 ) return float32_zero;
1388    zSign = ( a < 0 );
1389    absA = zSign ? - a : a;
1390    shiftCount = countLeadingZeros64( absA ) - 40;
1391    if ( 0 <= shiftCount ) {
1392        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393    }
1394    else {
1395        shiftCount += 7;
1396        if ( shiftCount < 0 ) {
1397            shift64RightJamming( absA, - shiftCount, &absA );
1398        }
1399        else {
1400            absA <<= shiftCount;
1401        }
1402        return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1403    }
1404
1405}
1406
1407/*----------------------------------------------------------------------------
1408| Returns the result of converting the 64-bit two's complement integer `a'
1409| to the double-precision floating-point format.  The conversion is performed
1410| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411*----------------------------------------------------------------------------*/
1412
1413float64 int64_to_float64(int64_t a, float_status *status)
1414{
1415    flag zSign;
1416
1417    if ( a == 0 ) return float64_zero;
1418    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1419        return packFloat64( 1, 0x43E, 0 );
1420    }
1421    zSign = ( a < 0 );
1422    return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1423}
1424
1425/*----------------------------------------------------------------------------
1426| Returns the result of converting the 64-bit two's complement integer `a'
1427| to the extended double-precision floating-point format.  The conversion
1428| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429| Arithmetic.
1430*----------------------------------------------------------------------------*/
1431
1432floatx80 int64_to_floatx80(int64_t a, float_status *status)
1433{
1434    flag zSign;
1435    uint64_t absA;
1436    int8_t shiftCount;
1437
1438    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439    zSign = ( a < 0 );
1440    absA = zSign ? - a : a;
1441    shiftCount = countLeadingZeros64( absA );
1442    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443
1444}
1445
1446/*----------------------------------------------------------------------------
1447| Returns the result of converting the 64-bit two's complement integer `a' to
1448| the quadruple-precision floating-point format.  The conversion is performed
1449| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450*----------------------------------------------------------------------------*/
1451
1452float128 int64_to_float128(int64_t a, float_status *status)
1453{
1454    flag zSign;
1455    uint64_t absA;
1456    int8_t shiftCount;
1457    int32_t zExp;
1458    uint64_t zSig0, zSig1;
1459
1460    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461    zSign = ( a < 0 );
1462    absA = zSign ? - a : a;
1463    shiftCount = countLeadingZeros64( absA ) + 49;
1464    zExp = 0x406E - shiftCount;
1465    if ( 64 <= shiftCount ) {
1466        zSig1 = 0;
1467        zSig0 = absA;
1468        shiftCount -= 64;
1469    }
1470    else {
1471        zSig1 = absA;
1472        zSig0 = 0;
1473    }
1474    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475    return packFloat128( zSign, zExp, zSig0, zSig1 );
1476
1477}
1478
1479/*----------------------------------------------------------------------------
1480| Returns the result of converting the 64-bit unsigned integer `a'
1481| to the single-precision floating-point format.  The conversion is performed
1482| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483*----------------------------------------------------------------------------*/
1484
1485float32 uint64_to_float32(uint64_t a, float_status *status)
1486{
1487    int shiftcount;
1488
1489    if (a == 0) {
1490        return float32_zero;
1491    }
1492
1493    /* Determine (left) shift needed to put first set bit into bit posn 23
1494     * (since packFloat32() expects the binary point between bits 23 and 22);
1495     * this is the fast case for smallish numbers.
1496     */
1497    shiftcount = countLeadingZeros64(a) - 40;
1498    if (shiftcount >= 0) {
1499        return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500    }
1501    /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502     * expects the binary point between bits 30 and 29, hence the + 7.
1503     */
1504    shiftcount += 7;
1505    if (shiftcount < 0) {
1506        shift64RightJamming(a, -shiftcount, &a);
1507    } else {
1508        a <<= shiftcount;
1509    }
1510
1511    return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1512}
1513
1514/*----------------------------------------------------------------------------
1515| Returns the result of converting the 64-bit unsigned integer `a'
1516| to the double-precision floating-point format.  The conversion is performed
1517| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518*----------------------------------------------------------------------------*/
1519
1520float64 uint64_to_float64(uint64_t a, float_status *status)
1521{
1522    int exp = 0x43C;
1523    int shiftcount;
1524
1525    if (a == 0) {
1526        return float64_zero;
1527    }
1528
1529    shiftcount = countLeadingZeros64(a) - 1;
1530    if (shiftcount < 0) {
1531        shift64RightJamming(a, -shiftcount, &a);
1532    } else {
1533        a <<= shiftcount;
1534    }
1535    return roundAndPackFloat64(0, exp - shiftcount, a, status);
1536}
1537
1538/*----------------------------------------------------------------------------
1539| Returns the result of converting the 64-bit unsigned integer `a'
1540| to the quadruple-precision floating-point format.  The conversion is performed
1541| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542*----------------------------------------------------------------------------*/
1543
1544float128 uint64_to_float128(uint64_t a, float_status *status)
1545{
1546    if (a == 0) {
1547        return float128_zero;
1548    }
1549    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1550}
1551
1552/*----------------------------------------------------------------------------
1553| Returns the result of converting the single-precision floating-point value
1554| `a' to the 32-bit two's complement integer format.  The conversion is
1555| performed according to the IEC/IEEE Standard for Binary Floating-Point
1556| Arithmetic---which means in particular that the conversion is rounded
1557| according to the current rounding mode.  If `a' is a NaN, the largest
1558| positive integer is returned.  Otherwise, if the conversion overflows, the
1559| largest integer with the same sign as `a' is returned.
1560*----------------------------------------------------------------------------*/
1561
1562int32_t float32_to_int32(float32 a, float_status *status)
1563{
1564    flag aSign;
1565    int aExp;
1566    int shiftCount;
1567    uint32_t aSig;
1568    uint64_t aSig64;
1569
1570    a = float32_squash_input_denormal(a, status);
1571    aSig = extractFloat32Frac( a );
1572    aExp = extractFloat32Exp( a );
1573    aSign = extractFloat32Sign( a );
1574    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575    if ( aExp ) aSig |= 0x00800000;
1576    shiftCount = 0xAF - aExp;
1577    aSig64 = aSig;
1578    aSig64 <<= 32;
1579    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1580    return roundAndPackInt32(aSign, aSig64, status);
1581
1582}
1583
1584/*----------------------------------------------------------------------------
1585| Returns the result of converting the single-precision floating-point value
1586| `a' to the 32-bit two's complement integer format.  The conversion is
1587| performed according to the IEC/IEEE Standard for Binary Floating-Point
1588| Arithmetic, except that the conversion is always rounded toward zero.
1589| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1590| the conversion overflows, the largest integer with the same sign as `a' is
1591| returned.
1592*----------------------------------------------------------------------------*/
1593
1594int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1595{
1596    flag aSign;
1597    int aExp;
1598    int shiftCount;
1599    uint32_t aSig;
1600    int32_t z;
1601    a = float32_squash_input_denormal(a, status);
1602
1603    aSig = extractFloat32Frac( a );
1604    aExp = extractFloat32Exp( a );
1605    aSign = extractFloat32Sign( a );
1606    shiftCount = aExp - 0x9E;
1607    if ( 0 <= shiftCount ) {
1608        if ( float32_val(a) != 0xCF000000 ) {
1609            float_raise(float_flag_invalid, status);
1610            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611        }
1612        return (int32_t) 0x80000000;
1613    }
1614    else if ( aExp <= 0x7E ) {
1615        if (aExp | aSig) {
1616            status->float_exception_flags |= float_flag_inexact;
1617        }
1618        return 0;
1619    }
1620    aSig = ( aSig | 0x00800000 )<<8;
1621    z = aSig>>( - shiftCount );
1622    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1623        status->float_exception_flags |= float_flag_inexact;
1624    }
1625    if ( aSign ) z = - z;
1626    return z;
1627
1628}
1629
1630/*----------------------------------------------------------------------------
1631| Returns the result of converting the single-precision floating-point value
1632| `a' to the 16-bit two's complement integer format.  The conversion is
1633| performed according to the IEC/IEEE Standard for Binary Floating-Point
1634| Arithmetic, except that the conversion is always rounded toward zero.
1635| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1636| the conversion overflows, the largest integer with the same sign as `a' is
1637| returned.
1638*----------------------------------------------------------------------------*/
1639
1640int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1641{
1642    flag aSign;
1643    int aExp;
1644    int shiftCount;
1645    uint32_t aSig;
1646    int32_t z;
1647
1648    aSig = extractFloat32Frac( a );
1649    aExp = extractFloat32Exp( a );
1650    aSign = extractFloat32Sign( a );
1651    shiftCount = aExp - 0x8E;
1652    if ( 0 <= shiftCount ) {
1653        if ( float32_val(a) != 0xC7000000 ) {
1654            float_raise(float_flag_invalid, status);
1655            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                return 0x7FFF;
1657            }
1658        }
1659        return (int32_t) 0xffff8000;
1660    }
1661    else if ( aExp <= 0x7E ) {
1662        if ( aExp | aSig ) {
1663            status->float_exception_flags |= float_flag_inexact;
1664        }
1665        return 0;
1666    }
1667    shiftCount -= 0x10;
1668    aSig = ( aSig | 0x00800000 )<<8;
1669    z = aSig>>( - shiftCount );
1670    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1671        status->float_exception_flags |= float_flag_inexact;
1672    }
1673    if ( aSign ) {
1674        z = - z;
1675    }
1676    return z;
1677
1678}
1679
1680/*----------------------------------------------------------------------------
1681| Returns the result of converting the single-precision floating-point value
1682| `a' to the 64-bit two's complement integer format.  The conversion is
1683| performed according to the IEC/IEEE Standard for Binary Floating-Point
1684| Arithmetic---which means in particular that the conversion is rounded
1685| according to the current rounding mode.  If `a' is a NaN, the largest
1686| positive integer is returned.  Otherwise, if the conversion overflows, the
1687| largest integer with the same sign as `a' is returned.
1688*----------------------------------------------------------------------------*/
1689
1690int64_t float32_to_int64(float32 a, float_status *status)
1691{
1692    flag aSign;
1693    int aExp;
1694    int shiftCount;
1695    uint32_t aSig;
1696    uint64_t aSig64, aSigExtra;
1697    a = float32_squash_input_denormal(a, status);
1698
1699    aSig = extractFloat32Frac( a );
1700    aExp = extractFloat32Exp( a );
1701    aSign = extractFloat32Sign( a );
1702    shiftCount = 0xBE - aExp;
1703    if ( shiftCount < 0 ) {
1704        float_raise(float_flag_invalid, status);
1705        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706            return LIT64( 0x7FFFFFFFFFFFFFFF );
1707        }
1708        return (int64_t) LIT64( 0x8000000000000000 );
1709    }
1710    if ( aExp ) aSig |= 0x00800000;
1711    aSig64 = aSig;
1712    aSig64 <<= 40;
1713    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1714    return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1715
1716}
1717
1718/*----------------------------------------------------------------------------
1719| Returns the result of converting the single-precision floating-point value
1720| `a' to the 64-bit unsigned integer format.  The conversion is
1721| performed according to the IEC/IEEE Standard for Binary Floating-Point
1722| Arithmetic---which means in particular that the conversion is rounded
1723| according to the current rounding mode.  If `a' is a NaN, the largest
1724| unsigned integer is returned.  Otherwise, if the conversion overflows, the
1725| largest unsigned integer is returned.  If the 'a' is negative, the result
1726| is rounded and zero is returned; values that do not round to zero will
1727| raise the inexact exception flag.
1728*----------------------------------------------------------------------------*/
1729
1730uint64_t float32_to_uint64(float32 a, float_status *status)
1731{
1732    flag aSign;
1733    int aExp;
1734    int shiftCount;
1735    uint32_t aSig;
1736    uint64_t aSig64, aSigExtra;
1737    a = float32_squash_input_denormal(a, status);
1738
1739    aSig = extractFloat32Frac(a);
1740    aExp = extractFloat32Exp(a);
1741    aSign = extractFloat32Sign(a);
1742    if ((aSign) && (aExp > 126)) {
1743        float_raise(float_flag_invalid, status);
1744        if (float32_is_any_nan(a)) {
1745            return LIT64(0xFFFFFFFFFFFFFFFF);
1746        } else {
1747            return 0;
1748        }
1749    }
1750    shiftCount = 0xBE - aExp;
1751    if (aExp) {
1752        aSig |= 0x00800000;
1753    }
1754    if (shiftCount < 0) {
1755        float_raise(float_flag_invalid, status);
1756        return LIT64(0xFFFFFFFFFFFFFFFF);
1757    }
1758
1759    aSig64 = aSig;
1760    aSig64 <<= 40;
1761    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1762    return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1763}
1764
1765/*----------------------------------------------------------------------------
1766| Returns the result of converting the single-precision floating-point value
1767| `a' to the 64-bit unsigned integer format.  The conversion is
1768| performed according to the IEC/IEEE Standard for Binary Floating-Point
1769| Arithmetic, except that the conversion is always rounded toward zero.  If
1770| `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1771| conversion overflows, the largest unsigned integer is returned.  If the
1772| 'a' is negative, the result is rounded and zero is returned; values that do
1773| not round to zero will raise the inexact flag.
1774*----------------------------------------------------------------------------*/
1775
1776uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1777{
1778    signed char current_rounding_mode = status->float_rounding_mode;
1779    set_float_rounding_mode(float_round_to_zero, status);
1780    int64_t v = float32_to_uint64(a, status);
1781    set_float_rounding_mode(current_rounding_mode, status);
1782    return v;
1783}
1784
1785/*----------------------------------------------------------------------------
1786| Returns the result of converting the single-precision floating-point value
1787| `a' to the 64-bit two's complement integer format.  The conversion is
1788| performed according to the IEC/IEEE Standard for Binary Floating-Point
1789| Arithmetic, except that the conversion is always rounded toward zero.  If
1790| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1791| conversion overflows, the largest integer with the same sign as `a' is
1792| returned.
1793*----------------------------------------------------------------------------*/
1794
1795int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1796{
1797    flag aSign;
1798    int aExp;
1799    int shiftCount;
1800    uint32_t aSig;
1801    uint64_t aSig64;
1802    int64_t z;
1803    a = float32_squash_input_denormal(a, status);
1804
1805    aSig = extractFloat32Frac( a );
1806    aExp = extractFloat32Exp( a );
1807    aSign = extractFloat32Sign( a );
1808    shiftCount = aExp - 0xBE;
1809    if ( 0 <= shiftCount ) {
1810        if ( float32_val(a) != 0xDF000000 ) {
1811            float_raise(float_flag_invalid, status);
1812            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813                return LIT64( 0x7FFFFFFFFFFFFFFF );
1814            }
1815        }
1816        return (int64_t) LIT64( 0x8000000000000000 );
1817    }
1818    else if ( aExp <= 0x7E ) {
1819        if (aExp | aSig) {
1820            status->float_exception_flags |= float_flag_inexact;
1821        }
1822        return 0;
1823    }
1824    aSig64 = aSig | 0x00800000;
1825    aSig64 <<= 40;
1826    z = aSig64>>( - shiftCount );
1827    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1828        status->float_exception_flags |= float_flag_inexact;
1829    }
1830    if ( aSign ) z = - z;
1831    return z;
1832
1833}
1834
1835/*----------------------------------------------------------------------------
1836| Returns the result of converting the single-precision floating-point value
1837| `a' to the double-precision floating-point format.  The conversion is
1838| performed according to the IEC/IEEE Standard for Binary Floating-Point
1839| Arithmetic.
1840*----------------------------------------------------------------------------*/
1841
1842float64 float32_to_float64(float32 a, float_status *status)
1843{
1844    flag aSign;
1845    int aExp;
1846    uint32_t aSig;
1847    a = float32_squash_input_denormal(a, status);
1848
1849    aSig = extractFloat32Frac( a );
1850    aExp = extractFloat32Exp( a );
1851    aSign = extractFloat32Sign( a );
1852    if ( aExp == 0xFF ) {
1853        if (aSig) {
1854            return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855        }
1856        return packFloat64( aSign, 0x7FF, 0 );
1857    }
1858    if ( aExp == 0 ) {
1859        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861        --aExp;
1862    }
1863    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1864
1865}
1866
1867/*----------------------------------------------------------------------------
1868| Returns the result of converting the single-precision floating-point value
1869| `a' to the extended double-precision floating-point format.  The conversion
1870| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871| Arithmetic.
1872*----------------------------------------------------------------------------*/
1873
1874floatx80 float32_to_floatx80(float32 a, float_status *status)
1875{
1876    flag aSign;
1877    int aExp;
1878    uint32_t aSig;
1879
1880    a = float32_squash_input_denormal(a, status);
1881    aSig = extractFloat32Frac( a );
1882    aExp = extractFloat32Exp( a );
1883    aSign = extractFloat32Sign( a );
1884    if ( aExp == 0xFF ) {
1885        if (aSig) {
1886            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887        }
1888        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889    }
1890    if ( aExp == 0 ) {
1891        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893    }
1894    aSig |= 0x00800000;
1895    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1896
1897}
1898
1899/*----------------------------------------------------------------------------
1900| Returns the result of converting the single-precision floating-point value
1901| `a' to the double-precision floating-point format.  The conversion is
1902| performed according to the IEC/IEEE Standard for Binary Floating-Point
1903| Arithmetic.
1904*----------------------------------------------------------------------------*/
1905
1906float128 float32_to_float128(float32 a, float_status *status)
1907{
1908    flag aSign;
1909    int aExp;
1910    uint32_t aSig;
1911
1912    a = float32_squash_input_denormal(a, status);
1913    aSig = extractFloat32Frac( a );
1914    aExp = extractFloat32Exp( a );
1915    aSign = extractFloat32Sign( a );
1916    if ( aExp == 0xFF ) {
1917        if (aSig) {
1918            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919        }
1920        return packFloat128( aSign, 0x7FFF, 0, 0 );
1921    }
1922    if ( aExp == 0 ) {
1923        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925        --aExp;
1926    }
1927    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1928
1929}
1930
1931/*----------------------------------------------------------------------------
1932| Rounds the single-precision floating-point value `a' to an integer, and
1933| returns the result as a single-precision floating-point value.  The
1934| operation is performed according to the IEC/IEEE Standard for Binary
1935| Floating-Point Arithmetic.
1936*----------------------------------------------------------------------------*/
1937
1938float32 float32_round_to_int(float32 a, float_status *status)
1939{
1940    flag aSign;
1941    int aExp;
1942    uint32_t lastBitMask, roundBitsMask;
1943    uint32_t z;
1944    a = float32_squash_input_denormal(a, status);
1945
1946    aExp = extractFloat32Exp( a );
1947    if ( 0x96 <= aExp ) {
1948        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1949            return propagateFloat32NaN(a, a, status);
1950        }
1951        return a;
1952    }
1953    if ( aExp <= 0x7E ) {
1954        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1955        status->float_exception_flags |= float_flag_inexact;
1956        aSign = extractFloat32Sign( a );
1957        switch (status->float_rounding_mode) {
1958         case float_round_nearest_even:
1959            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960                return packFloat32( aSign, 0x7F, 0 );
1961            }
1962            break;
1963        case float_round_ties_away:
1964            if (aExp == 0x7E) {
1965                return packFloat32(aSign, 0x7F, 0);
1966            }
1967            break;
1968         case float_round_down:
1969            return make_float32(aSign ? 0xBF800000 : 0);
1970         case float_round_up:
1971            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1972        }
1973        return packFloat32( aSign, 0, 0 );
1974    }
1975    lastBitMask = 1;
1976    lastBitMask <<= 0x96 - aExp;
1977    roundBitsMask = lastBitMask - 1;
1978    z = float32_val(a);
1979    switch (status->float_rounding_mode) {
1980    case float_round_nearest_even:
1981        z += lastBitMask>>1;
1982        if ((z & roundBitsMask) == 0) {
1983            z &= ~lastBitMask;
1984        }
1985        break;
1986    case float_round_ties_away:
1987        z += lastBitMask >> 1;
1988        break;
1989    case float_round_to_zero:
1990        break;
1991    case float_round_up:
1992        if (!extractFloat32Sign(make_float32(z))) {
1993            z += roundBitsMask;
1994        }
1995        break;
1996    case float_round_down:
1997        if (extractFloat32Sign(make_float32(z))) {
1998            z += roundBitsMask;
1999        }
2000        break;
2001    default:
2002        abort();
2003    }
2004    z &= ~ roundBitsMask;
2005    if (z != float32_val(a)) {
2006        status->float_exception_flags |= float_flag_inexact;
2007    }
2008    return make_float32(z);
2009
2010}
2011
2012/*----------------------------------------------------------------------------
2013| Returns the result of adding the absolute values of the single-precision
2014| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2015| before being returned.  `zSign' is ignored if the result is a NaN.
2016| The addition is performed according to the IEC/IEEE Standard for Binary
2017| Floating-Point Arithmetic.
2018*----------------------------------------------------------------------------*/
2019
2020static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021                              float_status *status)
2022{
2023    int aExp, bExp, zExp;
2024    uint32_t aSig, bSig, zSig;
2025    int expDiff;
2026
2027    aSig = extractFloat32Frac( a );
2028    aExp = extractFloat32Exp( a );
2029    bSig = extractFloat32Frac( b );
2030    bExp = extractFloat32Exp( b );
2031    expDiff = aExp - bExp;
2032    aSig <<= 6;
2033    bSig <<= 6;
2034    if ( 0 < expDiff ) {
2035        if ( aExp == 0xFF ) {
2036            if (aSig) {
2037                return propagateFloat32NaN(a, b, status);
2038            }
2039            return a;
2040        }
2041        if ( bExp == 0 ) {
2042            --expDiff;
2043        }
2044        else {
2045            bSig |= 0x20000000;
2046        }
2047        shift32RightJamming( bSig, expDiff, &bSig );
2048        zExp = aExp;
2049    }
2050    else if ( expDiff < 0 ) {
2051        if ( bExp == 0xFF ) {
2052            if (bSig) {
2053                return propagateFloat32NaN(a, b, status);
2054            }
2055            return packFloat32( zSign, 0xFF, 0 );
2056        }
2057        if ( aExp == 0 ) {
2058            ++expDiff;
2059        }
2060        else {
2061            aSig |= 0x20000000;
2062        }
2063        shift32RightJamming( aSig, - expDiff, &aSig );
2064        zExp = bExp;
2065    }
2066    else {
2067        if ( aExp == 0xFF ) {
2068            if (aSig | bSig) {
2069                return propagateFloat32NaN(a, b, status);
2070            }
2071            return a;
2072        }
2073        if ( aExp == 0 ) {
2074            if (status->flush_to_zero) {
2075                if (aSig | bSig) {
2076                    float_raise(float_flag_output_denormal, status);
2077                }
2078                return packFloat32(zSign, 0, 0);
2079            }
2080            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081        }
2082        zSig = 0x40000000 + aSig + bSig;
2083        zExp = aExp;
2084        goto roundAndPack;
2085    }
2086    aSig |= 0x20000000;
2087    zSig = ( aSig + bSig )<<1;
2088    --zExp;
2089    if ( (int32_t) zSig < 0 ) {
2090        zSig = aSig + bSig;
2091        ++zExp;
2092    }
2093 roundAndPack:
2094    return roundAndPackFloat32(zSign, zExp, zSig, status);
2095
2096}
2097
2098/*----------------------------------------------------------------------------
2099| Returns the result of subtracting the absolute values of the single-
2100| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2101| difference is negated before being returned.  `zSign' is ignored if the
2102| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2103| Standard for Binary Floating-Point Arithmetic.
2104*----------------------------------------------------------------------------*/
2105
2106static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107                              float_status *status)
2108{
2109    int aExp, bExp, zExp;
2110    uint32_t aSig, bSig, zSig;
2111    int expDiff;
2112
2113    aSig = extractFloat32Frac( a );
2114    aExp = extractFloat32Exp( a );
2115    bSig = extractFloat32Frac( b );
2116    bExp = extractFloat32Exp( b );
2117    expDiff = aExp - bExp;
2118    aSig <<= 7;
2119    bSig <<= 7;
2120    if ( 0 < expDiff ) goto aExpBigger;
2121    if ( expDiff < 0 ) goto bExpBigger;
2122    if ( aExp == 0xFF ) {
2123        if (aSig | bSig) {
2124            return propagateFloat32NaN(a, b, status);
2125        }
2126        float_raise(float_flag_invalid, status);
2127        return float32_default_nan(status);
2128    }
2129    if ( aExp == 0 ) {
2130        aExp = 1;
2131        bExp = 1;
2132    }
2133    if ( bSig < aSig ) goto aBigger;
2134    if ( aSig < bSig ) goto bBigger;
2135    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2136 bExpBigger:
2137    if ( bExp == 0xFF ) {
2138        if (bSig) {
2139            return propagateFloat32NaN(a, b, status);
2140        }
2141        return packFloat32( zSign ^ 1, 0xFF, 0 );
2142    }
2143    if ( aExp == 0 ) {
2144        ++expDiff;
2145    }
2146    else {
2147        aSig |= 0x40000000;
2148    }
2149    shift32RightJamming( aSig, - expDiff, &aSig );
2150    bSig |= 0x40000000;
2151 bBigger:
2152    zSig = bSig - aSig;
2153    zExp = bExp;
2154    zSign ^= 1;
2155    goto normalizeRoundAndPack;
2156 aExpBigger:
2157    if ( aExp == 0xFF ) {
2158        if (aSig) {
2159            return propagateFloat32NaN(a, b, status);
2160        }
2161        return a;
2162    }
2163    if ( bExp == 0 ) {
2164        --expDiff;
2165    }
2166    else {
2167        bSig |= 0x40000000;
2168    }
2169    shift32RightJamming( bSig, expDiff, &bSig );
2170    aSig |= 0x40000000;
2171 aBigger:
2172    zSig = aSig - bSig;
2173    zExp = aExp;
2174 normalizeRoundAndPack:
2175    --zExp;
2176    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2177
2178}
2179
2180/*----------------------------------------------------------------------------
2181| Returns the result of adding the single-precision floating-point values `a'
2182| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2183| Binary Floating-Point Arithmetic.
2184*----------------------------------------------------------------------------*/
2185
2186float32 float32_add(float32 a, float32 b, float_status *status)
2187{
2188    flag aSign, bSign;
2189    a = float32_squash_input_denormal(a, status);
2190    b = float32_squash_input_denormal(b, status);
2191
2192    aSign = extractFloat32Sign( a );
2193    bSign = extractFloat32Sign( b );
2194    if ( aSign == bSign ) {
2195        return addFloat32Sigs(a, b, aSign, status);
2196    }
2197    else {
2198        return subFloat32Sigs(a, b, aSign, status);
2199    }
2200
2201}
2202
2203/*----------------------------------------------------------------------------
2204| Returns the result of subtracting the single-precision floating-point values
2205| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2206| for Binary Floating-Point Arithmetic.
2207*----------------------------------------------------------------------------*/
2208
2209float32 float32_sub(float32 a, float32 b, float_status *status)
2210{
2211    flag aSign, bSign;
2212    a = float32_squash_input_denormal(a, status);
2213    b = float32_squash_input_denormal(b, status);
2214
2215    aSign = extractFloat32Sign( a );
2216    bSign = extractFloat32Sign( b );
2217    if ( aSign == bSign ) {
2218        return subFloat32Sigs(a, b, aSign, status);
2219    }
2220    else {
2221        return addFloat32Sigs(a, b, aSign, status);
2222    }
2223
2224}
2225
2226/*----------------------------------------------------------------------------
2227| Returns the result of multiplying the single-precision floating-point values
2228| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2229| for Binary Floating-Point Arithmetic.
2230*----------------------------------------------------------------------------*/
2231
2232float32 float32_mul(float32 a, float32 b, float_status *status)
2233{
2234    flag aSign, bSign, zSign;
2235    int aExp, bExp, zExp;
2236    uint32_t aSig, bSig;
2237    uint64_t zSig64;
2238    uint32_t zSig;
2239
2240    a = float32_squash_input_denormal(a, status);
2241    b = float32_squash_input_denormal(b, status);
2242
2243    aSig = extractFloat32Frac( a );
2244    aExp = extractFloat32Exp( a );
2245    aSign = extractFloat32Sign( a );
2246    bSig = extractFloat32Frac( b );
2247    bExp = extractFloat32Exp( b );
2248    bSign = extractFloat32Sign( b );
2249    zSign = aSign ^ bSign;
2250    if ( aExp == 0xFF ) {
2251        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2252            return propagateFloat32NaN(a, b, status);
2253        }
2254        if ( ( bExp | bSig ) == 0 ) {
2255            float_raise(float_flag_invalid, status);
2256            return float32_default_nan(status);
2257        }
2258        return packFloat32( zSign, 0xFF, 0 );
2259    }
2260    if ( bExp == 0xFF ) {
2261        if (bSig) {
2262            return propagateFloat32NaN(a, b, status);
2263        }
2264        if ( ( aExp | aSig ) == 0 ) {
2265            float_raise(float_flag_invalid, status);
2266            return float32_default_nan(status);
2267        }
2268        return packFloat32( zSign, 0xFF, 0 );
2269    }
2270    if ( aExp == 0 ) {
2271        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273    }
2274    if ( bExp == 0 ) {
2275        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277    }
2278    zExp = aExp + bExp - 0x7F;
2279    aSig = ( aSig | 0x00800000 )<<7;
2280    bSig = ( bSig | 0x00800000 )<<8;
2281    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2282    zSig = zSig64;
2283    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2284        zSig <<= 1;
2285        --zExp;
2286    }
2287    return roundAndPackFloat32(zSign, zExp, zSig, status);
2288
2289}
2290
2291/*----------------------------------------------------------------------------
2292| Returns the result of dividing the single-precision floating-point value `a'
2293| by the corresponding value `b'.  The operation is performed according to the
2294| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295*----------------------------------------------------------------------------*/
2296
2297float32 float32_div(float32 a, float32 b, float_status *status)
2298{
2299    flag aSign, bSign, zSign;
2300    int aExp, bExp, zExp;
2301    uint32_t aSig, bSig, zSig;
2302    a = float32_squash_input_denormal(a, status);
2303    b = float32_squash_input_denormal(b, status);
2304
2305    aSig = extractFloat32Frac( a );
2306    aExp = extractFloat32Exp( a );
2307    aSign = extractFloat32Sign( a );
2308    bSig = extractFloat32Frac( b );
2309    bExp = extractFloat32Exp( b );
2310    bSign = extractFloat32Sign( b );
2311    zSign = aSign ^ bSign;
2312    if ( aExp == 0xFF ) {
2313        if (aSig) {
2314            return propagateFloat32NaN(a, b, status);
2315        }
2316        if ( bExp == 0xFF ) {
2317            if (bSig) {
2318                return propagateFloat32NaN(a, b, status);
2319            }
2320            float_raise(float_flag_invalid, status);
2321            return float32_default_nan(status);
2322        }
2323        return packFloat32( zSign, 0xFF, 0 );
2324    }
2325    if ( bExp == 0xFF ) {
2326        if (bSig) {
2327            return propagateFloat32NaN(a, b, status);
2328        }
2329        return packFloat32( zSign, 0, 0 );
2330    }
2331    if ( bExp == 0 ) {
2332        if ( bSig == 0 ) {
2333            if ( ( aExp | aSig ) == 0 ) {
2334                float_raise(float_flag_invalid, status);
2335                return float32_default_nan(status);
2336            }
2337            float_raise(float_flag_divbyzero, status);
2338            return packFloat32( zSign, 0xFF, 0 );
2339        }
2340        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341    }
2342    if ( aExp == 0 ) {
2343        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345    }
2346    zExp = aExp - bExp + 0x7D;
2347    aSig = ( aSig | 0x00800000 )<<7;
2348    bSig = ( bSig | 0x00800000 )<<8;
2349    if ( bSig <= ( aSig + aSig ) ) {
2350        aSig >>= 1;
2351        ++zExp;
2352    }
2353    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2354    if ( ( zSig & 0x3F ) == 0 ) {
2355        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2356    }
2357    return roundAndPackFloat32(zSign, zExp, zSig, status);
2358
2359}
2360
2361/*----------------------------------------------------------------------------
2362| Returns the remainder of the single-precision floating-point value `a'
2363| with respect to the corresponding value `b'.  The operation is performed
2364| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365*----------------------------------------------------------------------------*/
2366
2367float32 float32_rem(float32 a, float32 b, float_status *status)
2368{
2369    flag aSign, zSign;
2370    int aExp, bExp, expDiff;
2371    uint32_t aSig, bSig;
2372    uint32_t q;
2373    uint64_t aSig64, bSig64, q64;
2374    uint32_t alternateASig;
2375    int32_t sigMean;
2376    a = float32_squash_input_denormal(a, status);
2377    b = float32_squash_input_denormal(b, status);
2378
2379    aSig = extractFloat32Frac( a );
2380    aExp = extractFloat32Exp( a );
2381    aSign = extractFloat32Sign( a );
2382    bSig = extractFloat32Frac( b );
2383    bExp = extractFloat32Exp( b );
2384    if ( aExp == 0xFF ) {
2385        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2386            return propagateFloat32NaN(a, b, status);
2387        }
2388        float_raise(float_flag_invalid, status);
2389        return float32_default_nan(status);
2390    }
2391    if ( bExp == 0xFF ) {
2392        if (bSig) {
2393            return propagateFloat32NaN(a, b, status);
2394        }
2395        return a;
2396    }
2397    if ( bExp == 0 ) {
2398        if ( bSig == 0 ) {
2399            float_raise(float_flag_invalid, status);
2400            return float32_default_nan(status);
2401        }
2402        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403    }
2404    if ( aExp == 0 ) {
2405        if ( aSig == 0 ) return a;
2406        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407    }
2408    expDiff = aExp - bExp;
2409    aSig |= 0x00800000;
2410    bSig |= 0x00800000;
2411    if ( expDiff < 32 ) {
2412        aSig <<= 8;
2413        bSig <<= 8;
2414        if ( expDiff < 0 ) {
2415            if ( expDiff < -1 ) return a;
2416            aSig >>= 1;
2417        }
2418        q = ( bSig <= aSig );
2419        if ( q ) aSig -= bSig;
2420        if ( 0 < expDiff ) {
2421            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2422            q >>= 32 - expDiff;
2423            bSig >>= 2;
2424            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425        }
2426        else {
2427            aSig >>= 2;
2428            bSig >>= 2;
2429        }
2430    }
2431    else {
2432        if ( bSig <= aSig ) aSig -= bSig;
2433        aSig64 = ( (uint64_t) aSig )<<40;
2434        bSig64 = ( (uint64_t) bSig )<<40;
2435        expDiff -= 64;
2436        while ( 0 < expDiff ) {
2437            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439            aSig64 = - ( ( bSig * q64 )<<38 );
2440            expDiff -= 62;
2441        }
2442        expDiff += 64;
2443        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445        q = q64>>( 64 - expDiff );
2446        bSig <<= 6;
2447        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448    }
2449    do {
2450        alternateASig = aSig;
2451        ++q;
2452        aSig -= bSig;
2453    } while ( 0 <= (int32_t) aSig );
2454    sigMean = aSig + alternateASig;
2455    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456        aSig = alternateASig;
2457    }
2458    zSign = ( (int32_t) aSig < 0 );
2459    if ( zSign ) aSig = - aSig;
2460    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2461}
2462
2463/*----------------------------------------------------------------------------
2464| Returns the result of multiplying the single-precision floating-point values
2465| `a' and `b' then adding 'c', with no intermediate rounding step after the
2466| multiplication.  The operation is performed according to the IEC/IEEE
2467| Standard for Binary Floating-Point Arithmetic 754-2008.
2468| The flags argument allows the caller to select negation of the
2469| addend, the intermediate product, or the final result. (The difference
2470| between this and having the caller do a separate negation is that negating
2471| externally will flip the sign bit on NaNs.)
2472*----------------------------------------------------------------------------*/
2473
2474float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475                       float_status *status)
2476{
2477    flag aSign, bSign, cSign, zSign;
2478    int aExp, bExp, cExp, pExp, zExp, expDiff;
2479    uint32_t aSig, bSig, cSig;
2480    flag pInf, pZero, pSign;
2481    uint64_t pSig64, cSig64, zSig64;
2482    uint32_t pSig;
2483    int shiftcount;
2484    flag signflip, infzero;
2485
2486    a = float32_squash_input_denormal(a, status);
2487    b = float32_squash_input_denormal(b, status);
2488    c = float32_squash_input_denormal(c, status);
2489    aSig = extractFloat32Frac(a);
2490    aExp = extractFloat32Exp(a);
2491    aSign = extractFloat32Sign(a);
2492    bSig = extractFloat32Frac(b);
2493    bExp = extractFloat32Exp(b);
2494    bSign = extractFloat32Sign(b);
2495    cSig = extractFloat32Frac(c);
2496    cExp = extractFloat32Exp(c);
2497    cSign = extractFloat32Sign(c);
2498
2499    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501
2502    /* It is implementation-defined whether the cases of (0,inf,qnan)
2503     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504     * they return if they do), so we have to hand this information
2505     * off to the target-specific pick-a-NaN routine.
2506     */
2507    if (((aExp == 0xff) && aSig) ||
2508        ((bExp == 0xff) && bSig) ||
2509        ((cExp == 0xff) && cSig)) {
2510        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2511    }
2512
2513    if (infzero) {
2514        float_raise(float_flag_invalid, status);
2515        return float32_default_nan(status);
2516    }
2517
2518    if (flags & float_muladd_negate_c) {
2519        cSign ^= 1;
2520    }
2521
2522    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523
2524    /* Work out the sign and type of the product */
2525    pSign = aSign ^ bSign;
2526    if (flags & float_muladd_negate_product) {
2527        pSign ^= 1;
2528    }
2529    pInf = (aExp == 0xff) || (bExp == 0xff);
2530    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531
2532    if (cExp == 0xff) {
2533        if (pInf && (pSign ^ cSign)) {
2534            /* addition of opposite-signed infinities => InvalidOperation */
2535            float_raise(float_flag_invalid, status);
2536            return float32_default_nan(status);
2537        }
2538        /* Otherwise generate an infinity of the same sign */
2539        return packFloat32(cSign ^ signflip, 0xff, 0);
2540    }
2541
2542    if (pInf) {
2543        return packFloat32(pSign ^ signflip, 0xff, 0);
2544    }
2545
2546    if (pZero) {
2547        if (cExp == 0) {
2548            if (cSig == 0) {
2549                /* Adding two exact zeroes */
2550                if (pSign == cSign) {
2551                    zSign = pSign;
2552                } else if (status->float_rounding_mode == float_round_down) {
2553                    zSign = 1;
2554                } else {
2555                    zSign = 0;
2556                }
2557                return packFloat32(zSign ^ signflip, 0, 0);
2558            }
2559            /* Exact zero plus a denorm */
2560            if (status->flush_to_zero) {
2561                float_raise(float_flag_output_denormal, status);
2562                return packFloat32(cSign ^ signflip, 0, 0);
2563            }
2564        }
2565        /* Zero plus something non-zero : just return the something */
2566        if (flags & float_muladd_halve_result) {
2567            if (cExp == 0) {
2568                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569            }
2570            /* Subtract one to halve, and one again because roundAndPackFloat32
2571             * wants one less than the true exponent.
2572             */
2573            cExp -= 2;
2574            cSig = (cSig | 0x00800000) << 7;
2575            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2576        }
2577        return packFloat32(cSign ^ signflip, cExp, cSig);
2578    }
2579
2580    if (aExp == 0) {
2581        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582    }
2583    if (bExp == 0) {
2584        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585    }
2586
2587    /* Calculate the actual result a * b + c */
2588
2589    /* Multiply first; this is easy. */
2590    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591     * because we want the true exponent, not the "one-less-than"
2592     * flavour that roundAndPackFloat32() takes.
2593     */
2594    pExp = aExp + bExp - 0x7e;
2595    aSig = (aSig | 0x00800000) << 7;
2596    bSig = (bSig | 0x00800000) << 8;
2597    pSig64 = (uint64_t)aSig * bSig;
2598    if ((int64_t)(pSig64 << 1) >= 0) {
2599        pSig64 <<= 1;
2600        pExp--;
2601    }
2602
2603    zSign = pSign ^ signflip;
2604
2605    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606     * position 62.
2607     */
2608    if (cExp == 0) {
2609        if (!cSig) {
2610            /* Throw out the special case of c being an exact zero now */
2611            shift64RightJamming(pSig64, 32, &pSig64);
2612            pSig = pSig64;
2613            if (flags & float_muladd_halve_result) {
2614                pExp--;
2615            }
2616            return roundAndPackFloat32(zSign, pExp - 1,
2617                                       pSig, status);
2618        }
2619        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620    }
2621
2622    cSig64 = (uint64_t)cSig << (62 - 23);
2623    cSig64 |= LIT64(0x4000000000000000);
2624    expDiff = pExp - cExp;
2625
2626    if (pSign == cSign) {
2627        /* Addition */
2628        if (expDiff > 0) {
2629            /* scale c to match p */
2630            shift64RightJamming(cSig64, expDiff, &cSig64);
2631            zExp = pExp;
2632        } else if (expDiff < 0) {
2633            /* scale p to match c */
2634            shift64RightJamming(pSig64, -expDiff, &pSig64);
2635            zExp = cExp;
2636        } else {
2637            /* no scaling needed */
2638            zExp = cExp;
2639        }
2640        /* Add significands and make sure explicit bit ends up in posn 62 */
2641        zSig64 = pSig64 + cSig64;
2642        if ((int64_t)zSig64 < 0) {
2643            shift64RightJamming(zSig64, 1, &zSig64);
2644        } else {
2645            zExp--;
2646        }
2647    } else {
2648        /* Subtraction */
2649        if (expDiff > 0) {
2650            shift64RightJamming(cSig64, expDiff, &cSig64);
2651            zSig64 = pSig64 - cSig64;
2652            zExp = pExp;
2653        } else if (expDiff < 0) {
2654            shift64RightJamming(pSig64, -expDiff, &pSig64);
2655            zSig64 = cSig64 - pSig64;
2656            zExp = cExp;
2657            zSign ^= 1;
2658        } else {
2659            zExp = pExp;
2660            if (cSig64 < pSig64) {
2661                zSig64 = pSig64 - cSig64;
2662            } else if (pSig64 < cSig64) {
2663                zSig64 = cSig64 - pSig64;
2664                zSign ^= 1;
2665            } else {
2666                /* Exact zero */
2667                zSign = signflip;
2668                if (status->float_rounding_mode == float_round_down) {
2669                    zSign ^= 1;
2670                }
2671                return packFloat32(zSign, 0, 0);
2672            }
2673        }
2674        --zExp;
2675        /* Normalize to put the explicit bit back into bit 62. */
2676        shiftcount = countLeadingZeros64(zSig64) - 1;
2677        zSig64 <<= shiftcount;
2678        zExp -= shiftcount;
2679    }
2680    if (flags & float_muladd_halve_result) {
2681        zExp--;
2682    }
2683
2684    shift64RightJamming(zSig64, 32, &zSig64);
2685    return roundAndPackFloat32(zSign, zExp, zSig64, status);
2686}
2687
2688
2689/*----------------------------------------------------------------------------
2690| Returns the square root of the single-precision floating-point value `a'.
2691| The operation is performed according to the IEC/IEEE Standard for Binary
2692| Floating-Point Arithmetic.
2693*----------------------------------------------------------------------------*/
2694
2695float32 float32_sqrt(float32 a, float_status *status)
2696{
2697    flag aSign;
2698    int aExp, zExp;
2699    uint32_t aSig, zSig;
2700    uint64_t rem, term;
2701    a = float32_squash_input_denormal(a, status);
2702
2703    aSig = extractFloat32Frac( a );
2704    aExp = extractFloat32Exp( a );
2705    aSign = extractFloat32Sign( a );
2706    if ( aExp == 0xFF ) {
2707        if (aSig) {
2708            return propagateFloat32NaN(a, float32_zero, status);
2709        }
2710        if ( ! aSign ) return a;
2711        float_raise(float_flag_invalid, status);
2712        return float32_default_nan(status);
2713    }
2714    if ( aSign ) {
2715        if ( ( aExp | aSig ) == 0 ) return a;
2716        float_raise(float_flag_invalid, status);
2717        return float32_default_nan(status);
2718    }
2719    if ( aExp == 0 ) {
2720        if ( aSig == 0 ) return float32_zero;
2721        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722    }
2723    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724    aSig = ( aSig | 0x00800000 )<<8;
2725    zSig = estimateSqrt32( aExp, aSig ) + 2;
2726    if ( ( zSig & 0x7F ) <= 5 ) {
2727        if ( zSig < 2 ) {
2728            zSig = 0x7FFFFFFF;
2729            goto roundAndPack;
2730        }
2731        aSig >>= aExp & 1;
2732        term = ( (uint64_t) zSig ) * zSig;
2733        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734        while ( (int64_t) rem < 0 ) {
2735            --zSig;
2736            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2737        }
2738        zSig |= ( rem != 0 );
2739    }
2740    shift32RightJamming( zSig, 1, &zSig );
2741 roundAndPack:
2742    return roundAndPackFloat32(0, zExp, zSig, status);
2743
2744}
2745
2746/*----------------------------------------------------------------------------
2747| Returns the binary exponential of the single-precision floating-point value
2748| `a'. The operation is performed according to the IEC/IEEE Standard for
2749| Binary Floating-Point Arithmetic.
2750|
2751| Uses the following identities:
2752|
2753| 1. -------------------------------------------------------------------------
2754|      x    x*ln(2)
2755|     2  = e
2756|
2757| 2. -------------------------------------------------------------------------
2758|                      2     3     4     5           n
2759|      x        x     x     x     x     x           x
2760|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761|               1!    2!    3!    4!    5!          n!
2762*----------------------------------------------------------------------------*/
2763
2764static const float64 float32_exp2_coefficients[15] =
2765{
2766    const_float64( 0x3ff0000000000000ll ), /*  1 */
2767    const_float64( 0x3fe0000000000000ll ), /*  2 */
2768    const_float64( 0x3fc5555555555555ll ), /*  3 */
2769    const_float64( 0x3fa5555555555555ll ), /*  4 */
2770    const_float64( 0x3f81111111111111ll ), /*  5 */
2771    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2772    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2773    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2774    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2775    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2781};
2782
2783float32 float32_exp2(float32 a, float_status *status)
2784{
2785    flag aSign;
2786    int aExp;
2787    uint32_t aSig;
2788    float64 r, x, xn;
2789    int i;
2790    a = float32_squash_input_denormal(a, status);
2791
2792    aSig = extractFloat32Frac( a );
2793    aExp = extractFloat32Exp( a );
2794    aSign = extractFloat32Sign( a );
2795
2796    if ( aExp == 0xFF) {
2797        if (aSig) {
2798            return propagateFloat32NaN(a, float32_zero, status);
2799        }
2800        return (aSign) ? float32_zero : a;
2801    }
2802    if (aExp == 0) {
2803        if (aSig == 0) return float32_one;
2804    }
2805
2806    float_raise(float_flag_inexact, status);
2807
2808    /* ******************************* */
2809    /* using float64 for approximation */
2810    /* ******************************* */
2811    x = float32_to_float64(a, status);
2812    x = float64_mul(x, float64_ln2, status);
2813
2814    xn = x;
2815    r = float64_one;
2816    for (i = 0 ; i < 15 ; i++) {
2817        float64 f;
2818
2819        f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820        r = float64_add(r, f, status);
2821
2822        xn = float64_mul(xn, x, status);
2823    }
2824
2825    return float64_to_float32(r, status);
2826}
2827
2828/*----------------------------------------------------------------------------
2829| Returns the binary log of the single-precision floating-point value `a'.
2830| The operation is performed according to the IEC/IEEE Standard for Binary
2831| Floating-Point Arithmetic.
2832*----------------------------------------------------------------------------*/
2833float32 float32_log2(float32 a, float_status *status)
2834{
2835    flag aSign, zSign;
2836    int aExp;
2837    uint32_t aSig, zSig, i;
2838
2839    a = float32_squash_input_denormal(a, status);
2840    aSig = extractFloat32Frac( a );
2841    aExp = extractFloat32Exp( a );
2842    aSign = extractFloat32Sign( a );
2843
2844    if ( aExp == 0 ) {
2845        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847    }
2848    if ( aSign ) {
2849        float_raise(float_flag_invalid, status);
2850        return float32_default_nan(status);
2851    }
2852    if ( aExp == 0xFF ) {
2853        if (aSig) {
2854            return propagateFloat32NaN(a, float32_zero, status);
2855        }
2856        return a;
2857    }
2858
2859    aExp -= 0x7F;
2860    aSig |= 0x00800000;
2861    zSign = aExp < 0;
2862    zSig = aExp << 23;
2863
2864    for (i = 1 << 22; i > 0; i >>= 1) {
2865        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2866        if ( aSig & 0x01000000 ) {
2867            aSig >>= 1;
2868            zSig |= i;
2869        }
2870    }
2871
2872    if ( zSign )
2873        zSig = -zSig;
2874
2875    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2876}
2877
2878/*----------------------------------------------------------------------------
2879| Returns 1 if the single-precision floating-point value `a' is equal to
2880| the corresponding value `b', and 0 otherwise.  The invalid exception is
2881| raised if either operand is a NaN.  Otherwise, the comparison is performed
2882| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883*----------------------------------------------------------------------------*/
2884
2885int float32_eq(float32 a, float32 b, float_status *status)
2886{
2887    uint32_t av, bv;
2888    a = float32_squash_input_denormal(a, status);
2889    b = float32_squash_input_denormal(b, status);
2890
2891    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893       ) {
2894        float_raise(float_flag_invalid, status);
2895        return 0;
2896    }
2897    av = float32_val(a);
2898    bv = float32_val(b);
2899    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2900}
2901
2902/*----------------------------------------------------------------------------
2903| Returns 1 if the single-precision floating-point value `a' is less than
2904| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2905| exception is raised if either operand is a NaN.  The comparison is performed
2906| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907*----------------------------------------------------------------------------*/
2908
2909int float32_le(float32 a, float32 b, float_status *status)
2910{
2911    flag aSign, bSign;
2912    uint32_t av, bv;
2913    a = float32_squash_input_denormal(a, status);
2914    b = float32_squash_input_denormal(b, status);
2915
2916    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918       ) {
2919        float_raise(float_flag_invalid, status);
2920        return 0;
2921    }
2922    aSign = extractFloat32Sign( a );
2923    bSign = extractFloat32Sign( b );
2924    av = float32_val(a);
2925    bv = float32_val(b);
2926    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2927    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2928
2929}
2930
2931/*----------------------------------------------------------------------------
2932| Returns 1 if the single-precision floating-point value `a' is less than
2933| the corresponding value `b', and 0 otherwise.  The invalid exception is
2934| raised if either operand is a NaN.  The comparison is performed according
2935| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936*----------------------------------------------------------------------------*/
2937
2938int float32_lt(float32 a, float32 b, float_status *status)
2939{
2940    flag aSign, bSign;
2941    uint32_t av, bv;
2942    a = float32_squash_input_denormal(a, status);
2943    b = float32_squash_input_denormal(b, status);
2944
2945    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947       ) {
2948        float_raise(float_flag_invalid, status);
2949        return 0;
2950    }
2951    aSign = extractFloat32Sign( a );
2952    bSign = extractFloat32Sign( b );
2953    av = float32_val(a);
2954    bv = float32_val(b);
2955    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2956    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2957
2958}
2959
2960/*----------------------------------------------------------------------------
2961| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962| be compared, and 0 otherwise.  The invalid exception is raised if either
2963| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2964| Standard for Binary Floating-Point Arithmetic.
2965*----------------------------------------------------------------------------*/
2966
2967int float32_unordered(float32 a, float32 b, float_status *status)
2968{
2969    a = float32_squash_input_denormal(a, status);
2970    b = float32_squash_input_denormal(b, status);
2971
2972    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974       ) {
2975        float_raise(float_flag_invalid, status);
2976        return 1;
2977    }
2978    return 0;
2979}
2980
2981/*----------------------------------------------------------------------------
2982| Returns 1 if the single-precision floating-point value `a' is equal to
2983| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2984| exception.  The comparison is performed according to the IEC/IEEE Standard
2985| for Binary Floating-Point Arithmetic.
2986*----------------------------------------------------------------------------*/
2987
2988int float32_eq_quiet(float32 a, float32 b, float_status *status)
2989{
2990    a = float32_squash_input_denormal(a, status);
2991    b = float32_squash_input_denormal(b, status);
2992
2993    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995       ) {
2996        if (float32_is_signaling_nan(a, status)
2997         || float32_is_signaling_nan(b, status)) {
2998            float_raise(float_flag_invalid, status);
2999        }
3000        return 0;
3001    }
3002    return ( float32_val(a) == float32_val(b) ) ||
3003            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3004}
3005
3006/*----------------------------------------------------------------------------
3007| Returns 1 if the single-precision floating-point value `a' is less than or
3008| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3009| cause an exception.  Otherwise, the comparison is performed according to the
3010| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011*----------------------------------------------------------------------------*/
3012
3013int float32_le_quiet(float32 a, float32 b, float_status *status)
3014{
3015    flag aSign, bSign;
3016    uint32_t av, bv;
3017    a = float32_squash_input_denormal(a, status);
3018    b = float32_squash_input_denormal(b, status);
3019
3020    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022       ) {
3023        if (float32_is_signaling_nan(a, status)
3024         || float32_is_signaling_nan(b, status)) {
3025            float_raise(float_flag_invalid, status);
3026        }
3027        return 0;
3028    }
3029    aSign = extractFloat32Sign( a );
3030    bSign = extractFloat32Sign( b );
3031    av = float32_val(a);
3032    bv = float32_val(b);
3033    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3034    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3035
3036}
3037
3038/*----------------------------------------------------------------------------
3039| Returns 1 if the single-precision floating-point value `a' is less than
3040| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3041| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3042| Standard for Binary Floating-Point Arithmetic.
3043*----------------------------------------------------------------------------*/
3044
3045int float32_lt_quiet(float32 a, float32 b, float_status *status)
3046{
3047    flag aSign, bSign;
3048    uint32_t av, bv;
3049    a = float32_squash_input_denormal(a, status);
3050    b = float32_squash_input_denormal(b, status);
3051
3052    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054       ) {
3055        if (float32_is_signaling_nan(a, status)
3056         || float32_is_signaling_nan(b, status)) {
3057            float_raise(float_flag_invalid, status);
3058        }
3059        return 0;
3060    }
3061    aSign = extractFloat32Sign( a );
3062    bSign = extractFloat32Sign( b );
3063    av = float32_val(a);
3064    bv = float32_val(b);
3065    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3066    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3067
3068}
3069
3070/*----------------------------------------------------------------------------
3071| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3073| comparison is performed according to the IEC/IEEE Standard for Binary
3074| Floating-Point Arithmetic.
3075*----------------------------------------------------------------------------*/
3076
3077int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3078{
3079    a = float32_squash_input_denormal(a, status);
3080    b = float32_squash_input_denormal(b, status);
3081
3082    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084       ) {
3085        if (float32_is_signaling_nan(a, status)
3086         || float32_is_signaling_nan(b, status)) {
3087            float_raise(float_flag_invalid, status);
3088        }
3089        return 1;
3090    }
3091    return 0;
3092}
3093
3094/*----------------------------------------------------------------------------
3095| Returns the result of converting the double-precision floating-point value
3096| `a' to the 32-bit two's complement integer format.  The conversion is
3097| performed according to the IEC/IEEE Standard for Binary Floating-Point
3098| Arithmetic---which means in particular that the conversion is rounded
3099| according to the current rounding mode.  If `a' is a NaN, the largest
3100| positive integer is returned.  Otherwise, if the conversion overflows, the
3101| largest integer with the same sign as `a' is returned.
3102*----------------------------------------------------------------------------*/
3103
3104int32_t float64_to_int32(float64 a, float_status *status)
3105{
3106    flag aSign;
3107    int aExp;
3108    int shiftCount;
3109    uint64_t aSig;
3110    a = float64_squash_input_denormal(a, status);
3111
3112    aSig = extractFloat64Frac( a );
3113    aExp = extractFloat64Exp( a );
3114    aSign = extractFloat64Sign( a );
3115    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117    shiftCount = 0x42C - aExp;
3118    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3119    return roundAndPackInt32(aSign, aSig, status);
3120
3121}
3122
3123/*----------------------------------------------------------------------------
3124| Returns the result of converting the double-precision floating-point value
3125| `a' to the 32-bit two's complement integer format.  The conversion is
3126| performed according to the IEC/IEEE Standard for Binary Floating-Point
3127| Arithmetic, except that the conversion is always rounded toward zero.
3128| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3129| the conversion overflows, the largest integer with the same sign as `a' is
3130| returned.
3131*----------------------------------------------------------------------------*/
3132
3133int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3134{
3135    flag aSign;
3136    int aExp;
3137    int shiftCount;
3138    uint64_t aSig, savedASig;
3139    int32_t z;
3140    a = float64_squash_input_denormal(a, status);
3141
3142    aSig = extractFloat64Frac( a );
3143    aExp = extractFloat64Exp( a );
3144    aSign = extractFloat64Sign( a );
3145    if ( 0x41E < aExp ) {
3146        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147        goto invalid;
3148    }
3149    else if ( aExp < 0x3FF ) {
3150        if (aExp || aSig) {
3151            status->float_exception_flags |= float_flag_inexact;
3152        }
3153        return 0;
3154    }
3155    aSig |= LIT64( 0x0010000000000000 );
3156    shiftCount = 0x433 - aExp;
3157    savedASig = aSig;
3158    aSig >>= shiftCount;
3159    z = aSig;
3160    if ( aSign ) z = - z;
3161    if ( ( z < 0 ) ^ aSign ) {
3162 invalid:
3163        float_raise(float_flag_invalid, status);
3164        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3165    }
3166    if ( ( aSig<<shiftCount ) != savedASig ) {
3167        status->float_exception_flags |= float_flag_inexact;
3168    }
3169    return z;
3170
3171}
3172
3173/*----------------------------------------------------------------------------
3174| Returns the result of converting the double-precision floating-point value
3175| `a' to the 16-bit two's complement integer format.  The conversion is
3176| performed according to the IEC/IEEE Standard for Binary Floating-Point
3177| Arithmetic, except that the conversion is always rounded toward zero.
3178| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3179| the conversion overflows, the largest integer with the same sign as `a' is
3180| returned.
3181*----------------------------------------------------------------------------*/
3182
3183int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3184{
3185    flag aSign;
3186    int aExp;
3187    int shiftCount;
3188    uint64_t aSig, savedASig;
3189    int32_t z;
3190
3191    aSig = extractFloat64Frac( a );
3192    aExp = extractFloat64Exp( a );
3193    aSign = extractFloat64Sign( a );
3194    if ( 0x40E < aExp ) {
3195        if ( ( aExp == 0x7FF ) && aSig ) {
3196            aSign = 0;
3197        }
3198        goto invalid;
3199    }
3200    else if ( aExp < 0x3FF ) {
3201        if ( aExp || aSig ) {
3202            status->float_exception_flags |= float_flag_inexact;
3203        }
3204        return 0;
3205    }
3206    aSig |= LIT64( 0x0010000000000000 );
3207    shiftCount = 0x433 - aExp;
3208    savedASig = aSig;
3209    aSig >>= shiftCount;
3210    z = aSig;
3211    if ( aSign ) {
3212        z = - z;
3213    }
3214    if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215 invalid:
3216        float_raise(float_flag_invalid, status);
3217        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3218    }
3219    if ( ( aSig<<shiftCount ) != savedASig ) {
3220        status->float_exception_flags |= float_flag_inexact;
3221    }
3222    return z;
3223}
3224
3225/*----------------------------------------------------------------------------
3226| Returns the result of converting the double-precision floating-point value
3227| `a' to the 64-bit two's complement integer format.  The conversion is
3228| performed according to the IEC/IEEE Standard for Binary Floating-Point
3229| Arithmetic---which means in particular that the conversion is rounded
3230| according to the current rounding mode.  If `a' is a NaN, the largest
3231| positive integer is returned.  Otherwise, if the conversion overflows, the
3232| largest integer with the same sign as `a' is returned.
3233*----------------------------------------------------------------------------*/
3234
3235int64_t float64_to_int64(float64 a, float_status *status)
3236{
3237    flag aSign;
3238    int aExp;
3239    int shiftCount;
3240    uint64_t aSig, aSigExtra;
3241    a = float64_squash_input_denormal(a, status);
3242
3243    aSig = extractFloat64Frac( a );
3244    aExp = extractFloat64Exp( a );
3245    aSign = extractFloat64Sign( a );
3246    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247    shiftCount = 0x433 - aExp;
3248    if ( shiftCount <= 0 ) {
3249        if ( 0x43E < aExp ) {
3250            float_raise(float_flag_invalid, status);
3251            if (    ! aSign
3252                 || (    ( aExp == 0x7FF )
3253                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254               ) {
3255                return LIT64( 0x7FFFFFFFFFFFFFFF );
3256            }
3257            return (int64_t) LIT64( 0x8000000000000000 );
3258        }
3259        aSigExtra = 0;
3260        aSig <<= - shiftCount;
3261    }
3262    else {
3263        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264    }
3265    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3266
3267}
3268
3269/*----------------------------------------------------------------------------
3270| Returns the result of converting the double-precision floating-point value
3271| `a' to the 64-bit two's complement integer format.  The conversion is
3272| performed according to the IEC/IEEE Standard for Binary Floating-Point
3273| Arithmetic, except that the conversion is always rounded toward zero.
3274| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3275| the conversion overflows, the largest integer with the same sign as `a' is
3276| returned.
3277*----------------------------------------------------------------------------*/
3278
3279int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3280{
3281    flag aSign;
3282    int aExp;
3283    int shiftCount;
3284    uint64_t aSig;
3285    int64_t z;
3286    a = float64_squash_input_denormal(a, status);
3287
3288    aSig = extractFloat64Frac( a );
3289    aExp = extractFloat64Exp( a );
3290    aSign = extractFloat64Sign( a );
3291    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292    shiftCount = aExp - 0x433;
3293    if ( 0 <= shiftCount ) {
3294        if ( 0x43E <= aExp ) {
3295            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3296                float_raise(float_flag_invalid, status);
3297                if (    ! aSign
3298                     || (    ( aExp == 0x7FF )
3299                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300                   ) {
3301                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3302                }
3303            }
3304            return (int64_t) LIT64( 0x8000000000000000 );
3305        }
3306        z = aSig<<shiftCount;
3307    }
3308    else {
3309        if ( aExp < 0x3FE ) {
3310            if (aExp | aSig) {
3311                status->float_exception_flags |= float_flag_inexact;
3312            }
3313            return 0;
3314        }
3315        z = aSig>>( - shiftCount );
3316        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3317            status->float_exception_flags |= float_flag_inexact;
3318        }
3319    }
3320    if ( aSign ) z = - z;
3321    return z;
3322
3323}
3324
3325/*----------------------------------------------------------------------------
3326| Returns the result of converting the double-precision floating-point value
3327| `a' to the single-precision floating-point format.  The conversion is
3328| performed according to the IEC/IEEE Standard for Binary Floating-Point
3329| Arithmetic.
3330*----------------------------------------------------------------------------*/
3331
3332float32 float64_to_float32(float64 a, float_status *status)
3333{
3334    flag aSign;
3335    int aExp;
3336    uint64_t aSig;
3337    uint32_t zSig;
3338    a = float64_squash_input_denormal(a, status);
3339
3340    aSig = extractFloat64Frac( a );
3341    aExp = extractFloat64Exp( a );
3342    aSign = extractFloat64Sign( a );
3343    if ( aExp == 0x7FF ) {
3344        if (aSig) {
3345            return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346        }
3347        return packFloat32( aSign, 0xFF, 0 );
3348    }
3349    shift64RightJamming( aSig, 22, &aSig );
3350    zSig = aSig;
3351    if ( aExp || zSig ) {
3352        zSig |= 0x40000000;
3353        aExp -= 0x381;
3354    }
3355    return roundAndPackFloat32(aSign, aExp, zSig, status);
3356
3357}
3358
3359
3360/*----------------------------------------------------------------------------
3361| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362| half-precision floating-point value, returning the result.  After being
3363| shifted into the proper positions, the three fields are simply added
3364| together to form the result.  This means that any integer portion of `zSig'
3365| will be added into the exponent.  Since a properly normalized significand
3366| will have an integer portion equal to 1, the `zExp' input should be 1 less
3367| than the desired result exponent whenever `zSig' is a complete, normalized
3368| significand.
3369*----------------------------------------------------------------------------*/
3370static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3371{
3372    return make_float16(
3373        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3374}
3375
3376/*----------------------------------------------------------------------------
3377| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378| and significand `zSig', and returns the proper half-precision floating-
3379| point value corresponding to the abstract input.  Ordinarily, the abstract
3380| value is simply rounded and packed into the half-precision format, with
3381| the inexact exception raised if the abstract input cannot be represented
3382| exactly.  However, if the abstract value is too large, the overflow and
3383| inexact exceptions are raised and an infinity or maximal finite value is
3384| returned.  If the abstract value is too small, the input value is rounded to
3385| a subnormal number, and the underflow and inexact exceptions are raised if
3386| the abstract input cannot be represented exactly as a subnormal half-
3387| precision floating-point number.
3388| The `ieee' flag indicates whether to use IEEE standard half precision, or
3389| ARM-style "alternative representation", which omits the NaN and Inf
3390| encodings in order to raise the maximum representable exponent by one.
3391|     The input significand `zSig' has its binary point between bits 22
3392| and 23, which is 13 bits to the left of the usual location.  This shifted
3393| significand must be normalized or smaller.  If `zSig' is not normalized,
3394| `zExp' must be 0; in that case, the result returned is a subnormal number,
3395| and it must not require rounding.  In the usual case that `zSig' is
3396| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397| Note the slightly odd position of the binary point in zSig compared with the
3398| other roundAndPackFloat functions. This should probably be fixed if we
3399| need to implement more float16 routines than just conversion.
3400| The handling of underflow and overflow follows the IEC/IEEE Standard for
3401| Binary Floating-Point Arithmetic.
3402*----------------------------------------------------------------------------*/
3403
3404static float16 roundAndPackFloat16(flag zSign, int zExp,
3405                                   uint32_t zSig, flag ieee,
3406                                   float_status *status)
3407{
3408    int maxexp = ieee ? 29 : 30;
3409    uint32_t mask;
3410    uint32_t increment;
3411    bool rounding_bumps_exp;
3412    bool is_tiny = false;
3413
3414    /* Calculate the mask of bits of the mantissa which are not
3415     * representable in half-precision and will be lost.
3416     */
3417    if (zExp < 1) {
3418        /* Will be denormal in halfprec */
3419        mask = 0x00ffffff;
3420        if (zExp >= -11) {
3421            mask >>= 11 + zExp;
3422        }
3423    } else {
3424        /* Normal number in halfprec */
3425        mask = 0x00001fff;
3426    }
3427
3428    switch (status->float_rounding_mode) {
3429    case float_round_nearest_even:
3430        increment = (mask + 1) >> 1;
3431        if ((zSig & mask) == increment) {
3432            increment = zSig & (increment << 1);
3433        }
3434        break;
3435    case float_round_ties_away:
3436        increment = (mask + 1) >> 1;
3437        break;
3438    case float_round_up:
3439        increment = zSign ? 0 : mask;
3440        break;
3441    case float_round_down:
3442        increment = zSign ? mask : 0;
3443        break;
3444    default: /* round_to_zero */
3445        increment = 0;
3446        break;
3447    }
3448
3449    rounding_bumps_exp = (zSig + increment >= 0x01000000);
3450
3451    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3452        if (ieee) {
3453            float_raise(float_flag_overflow | float_flag_inexact, status);
3454            return packFloat16(zSign, 0x1f, 0);
3455        } else {
3456            float_raise(float_flag_invalid, status);
3457            return packFloat16(zSign, 0x1f, 0x3ff);
3458        }
3459    }
3460
3461    if (zExp < 0) {
3462        /* Note that flush-to-zero does not affect half-precision results */
3463        is_tiny =
3464            (status->float_detect_tininess == float_tininess_before_rounding)
3465            || (zExp < -1)
3466            || (!rounding_bumps_exp);
3467    }
3468    if (zSig & mask) {
3469        float_raise(float_flag_inexact, status);
3470        if (is_tiny) {
3471            float_raise(float_flag_underflow, status);
3472        }
3473    }
3474
3475    zSig += increment;
3476    if (rounding_bumps_exp) {
3477        zSig >>= 1;
3478        zExp++;
3479    }
3480
3481    if (zExp < -10) {
3482        return packFloat16(zSign, 0, 0);
3483    }
3484    if (zExp < 0) {
3485        zSig >>= -zExp;
3486        zExp = 0;
3487    }
3488    return packFloat16(zSign, zExp, zSig >> 13);
3489}
3490
3491static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3492                                      uint32_t *zSigPtr)
3493{
3494    int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3495    *zSigPtr = aSig << shiftCount;
3496    *zExpPtr = 1 - shiftCount;
3497}
3498
3499/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3500   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3501
3502float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3503{
3504    flag aSign;
3505    int aExp;
3506    uint32_t aSig;
3507
3508    aSign = extractFloat16Sign(a);
3509    aExp = extractFloat16Exp(a);
3510    aSig = extractFloat16Frac(a);
3511
3512    if (aExp == 0x1f && ieee) {
3513        if (aSig) {
3514            return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3515        }
3516        return packFloat32(aSign, 0xff, 0);
3517    }
3518    if (aExp == 0) {
3519        if (aSig == 0) {
3520            return packFloat32(aSign, 0, 0);
3521        }
3522
3523        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3524        aExp--;
3525    }
3526    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3527}
3528
3529float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3530{
3531    flag aSign;
3532    int aExp;
3533    uint32_t aSig;
3534
3535    a = float32_squash_input_denormal(a, status);
3536
3537    aSig = extractFloat32Frac( a );
3538    aExp = extractFloat32Exp( a );
3539    aSign = extractFloat32Sign( a );
3540    if ( aExp == 0xFF ) {
3541        if (aSig) {
3542            /* Input is a NaN */
3543            if (!ieee) {
3544                float_raise(float_flag_invalid, status);
3545                return packFloat16(aSign, 0, 0);
3546            }
3547            return commonNaNToFloat16(
3548                float32ToCommonNaN(a, status), status);
3549        }
3550        /* Infinity */
3551        if (!ieee) {
3552            float_raise(float_flag_invalid, status);
3553            return packFloat16(aSign, 0x1f, 0x3ff);
3554        }
3555        return packFloat16(aSign, 0x1f, 0);
3556    }
3557    if (aExp == 0 && aSig == 0) {
3558        return packFloat16(aSign, 0, 0);
3559    }
3560    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3561     * even if the input is denormal; however this is harmless because
3562     * the largest possible single-precision denormal is still smaller
3563     * than the smallest representable half-precision denormal, and so we
3564     * will end up ignoring aSig and returning via the "always return zero"
3565     * codepath.
3566     */
3567    aSig |= 0x00800000;
3568    aExp -= 0x71;
3569
3570    return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3571}
3572
3573float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3574{
3575    flag aSign;
3576    int aExp;
3577    uint32_t aSig;
3578
3579    aSign = extractFloat16Sign(a);
3580    aExp = extractFloat16Exp(a);
3581    aSig = extractFloat16Frac(a);
3582
3583    if (aExp == 0x1f && ieee) {
3584        if (aSig) {
3585            return commonNaNToFloat64(
3586                float16ToCommonNaN(a, status), status);
3587        }
3588        return packFloat64(aSign, 0x7ff, 0);
3589    }
3590    if (aExp == 0) {
3591        if (aSig == 0) {
3592            return packFloat64(aSign, 0, 0);
3593        }
3594
3595        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3596        aExp--;
3597    }
3598    return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3599}
3600
3601float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3602{
3603    flag aSign;
3604    int aExp;
3605    uint64_t aSig;
3606    uint32_t zSig;
3607
3608    a = float64_squash_input_denormal(a, status);
3609
3610    aSig = extractFloat64Frac(a);
3611    aExp = extractFloat64Exp(a);
3612    aSign = extractFloat64Sign(a);
3613    if (aExp == 0x7FF) {
3614        if (aSig) {
3615            /* Input is a NaN */
3616            if (!ieee) {
3617                float_raise(float_flag_invalid, status);
3618                return packFloat16(aSign, 0, 0);
3619            }
3620            return commonNaNToFloat16(
3621                float64ToCommonNaN(a, status), status);
3622        }
3623        /* Infinity */
3624        if (!ieee) {
3625            float_raise(float_flag_invalid, status);
3626            return packFloat16(aSign, 0x1f, 0x3ff);
3627        }
3628        return packFloat16(aSign, 0x1f, 0);
3629    }
3630    shift64RightJamming(aSig, 29, &aSig);
3631    zSig = aSig;
3632    if (aExp == 0 && zSig == 0) {
3633        return packFloat16(aSign, 0, 0);
3634    }
3635    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3636     * even if the input is denormal; however this is harmless because
3637     * the largest possible single-precision denormal is still smaller
3638     * than the smallest representable half-precision denormal, and so we
3639     * will end up ignoring aSig and returning via the "always return zero"
3640     * codepath.
3641     */
3642    zSig |= 0x00800000;
3643    aExp -= 0x3F1;
3644
3645    return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3646}
3647
3648/*----------------------------------------------------------------------------
3649| Returns the result of converting the double-precision floating-point value
3650| `a' to the extended double-precision floating-point format.  The conversion
3651| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3652| Arithmetic.
3653*----------------------------------------------------------------------------*/
3654
3655floatx80 float64_to_floatx80(float64 a, float_status *status)
3656{
3657    flag aSign;
3658    int aExp;
3659    uint64_t aSig;
3660
3661    a = float64_squash_input_denormal(a, status);
3662    aSig = extractFloat64Frac( a );
3663    aExp = extractFloat64Exp( a );
3664    aSign = extractFloat64Sign( a );
3665    if ( aExp == 0x7FF ) {
3666        if (aSig) {
3667            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3668        }
3669        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670    }
3671    if ( aExp == 0 ) {
3672        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3673        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674    }
3675    return
3676        packFloatx80(
3677            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3678
3679}
3680
3681/*----------------------------------------------------------------------------
3682| Returns the result of converting the double-precision floating-point value
3683| `a' to the quadruple-precision floating-point format.  The conversion is
3684| performed according to the IEC/IEEE Standard for Binary Floating-Point
3685| Arithmetic.
3686*----------------------------------------------------------------------------*/
3687
3688float128 float64_to_float128(float64 a, float_status *status)
3689{
3690    flag aSign;
3691    int aExp;
3692    uint64_t aSig, zSig0, zSig1;
3693
3694    a = float64_squash_input_denormal(a, status);
3695    aSig = extractFloat64Frac( a );
3696    aExp = extractFloat64Exp( a );
3697    aSign = extractFloat64Sign( a );
3698    if ( aExp == 0x7FF ) {
3699        if (aSig) {
3700            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3701        }
3702        return packFloat128( aSign, 0x7FFF, 0, 0 );
3703    }
3704    if ( aExp == 0 ) {
3705        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3707        --aExp;
3708    }
3709    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3710    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3711
3712}
3713
3714/*----------------------------------------------------------------------------
3715| Rounds the double-precision floating-point value `a' to an integer, and
3716| returns the result as a double-precision floating-point value.  The
3717| operation is performed according to the IEC/IEEE Standard for Binary
3718| Floating-Point Arithmetic.
3719*----------------------------------------------------------------------------*/
3720
3721float64 float64_round_to_int(float64 a, float_status *status)
3722{
3723    flag aSign;
3724    int aExp;
3725    uint64_t lastBitMask, roundBitsMask;
3726    uint64_t z;
3727    a = float64_squash_input_denormal(a, status);
3728
3729    aExp = extractFloat64Exp( a );
3730    if ( 0x433 <= aExp ) {
3731        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3732            return propagateFloat64NaN(a, a, status);
3733        }
3734        return a;
3735    }
3736    if ( aExp < 0x3FF ) {
3737        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3738        status->float_exception_flags |= float_flag_inexact;
3739        aSign = extractFloat64Sign( a );
3740        switch (status->float_rounding_mode) {
3741         case float_round_nearest_even:
3742            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3743                return packFloat64( aSign, 0x3FF, 0 );
3744            }
3745            break;
3746        case float_round_ties_away:
3747            if (aExp == 0x3FE) {
3748                return packFloat64(aSign, 0x3ff, 0);
3749            }
3750            break;
3751         case float_round_down:
3752            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3753         case float_round_up:
3754            return make_float64(
3755            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3756        }
3757        return packFloat64( aSign, 0, 0 );
3758    }
3759    lastBitMask = 1;
3760    lastBitMask <<= 0x433 - aExp;
3761    roundBitsMask = lastBitMask - 1;
3762    z = float64_val(a);
3763    switch (status->float_rounding_mode) {
3764    case float_round_nearest_even:
3765        z += lastBitMask >> 1;
3766        if ((z & roundBitsMask) == 0) {
3767            z &= ~lastBitMask;
3768        }
3769        break;
3770    case float_round_ties_away:
3771        z += lastBitMask >> 1;
3772        break;
3773    case float_round_to_zero:
3774        break;
3775    case float_round_up:
3776        if (!extractFloat64Sign(make_float64(z))) {
3777            z += roundBitsMask;
3778        }
3779        break;
3780    case float_round_down:
3781        if (extractFloat64Sign(make_float64(z))) {
3782            z += roundBitsMask;
3783        }
3784        break;
3785    default:
3786        abort();
3787    }
3788    z &= ~ roundBitsMask;
3789    if (z != float64_val(a)) {
3790        status->float_exception_flags |= float_flag_inexact;
3791    }
3792    return make_float64(z);
3793
3794}
3795
3796float64 float64_trunc_to_int(float64 a, float_status *status)
3797{
3798    int oldmode;
3799    float64 res;
3800    oldmode = status->float_rounding_mode;
3801    status->float_rounding_mode = float_round_to_zero;
3802    res = float64_round_to_int(a, status);
3803    status->float_rounding_mode = oldmode;
3804    return res;
3805}
3806
3807/*----------------------------------------------------------------------------
3808| Returns the result of adding the absolute values of the double-precision
3809| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3810| before being returned.  `zSign' is ignored if the result is a NaN.
3811| The addition is performed according to the IEC/IEEE Standard for Binary
3812| Floating-Point Arithmetic.
3813*----------------------------------------------------------------------------*/
3814
3815static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3816                              float_status *status)
3817{
3818    int aExp, bExp, zExp;
3819    uint64_t aSig, bSig, zSig;
3820    int expDiff;
3821
3822    aSig = extractFloat64Frac( a );
3823    aExp = extractFloat64Exp( a );
3824    bSig = extractFloat64Frac( b );
3825    bExp = extractFloat64Exp( b );
3826    expDiff = aExp - bExp;
3827    aSig <<= 9;
3828    bSig <<= 9;
3829    if ( 0 < expDiff ) {
3830        if ( aExp == 0x7FF ) {
3831            if (aSig) {
3832                return propagateFloat64NaN(a, b, status);
3833            }
3834            return a;
3835        }
3836        if ( bExp == 0 ) {
3837            --expDiff;
3838        }
3839        else {
3840            bSig |= LIT64( 0x2000000000000000 );
3841        }
3842        shift64RightJamming( bSig, expDiff, &bSig );
3843        zExp = aExp;
3844    }
3845    else if ( expDiff < 0 ) {
3846        if ( bExp == 0x7FF ) {
3847            if (bSig) {
3848                return propagateFloat64NaN(a, b, status);
3849            }
3850            return packFloat64( zSign, 0x7FF, 0 );
3851        }
3852        if ( aExp == 0 ) {
3853            ++expDiff;
3854        }
3855        else {
3856            aSig |= LIT64( 0x2000000000000000 );
3857        }
3858        shift64RightJamming( aSig, - expDiff, &aSig );
3859        zExp = bExp;
3860    }
3861    else {
3862        if ( aExp == 0x7FF ) {
3863            if (aSig | bSig) {
3864                return propagateFloat64NaN(a, b, status);
3865            }
3866            return a;
3867        }
3868        if ( aExp == 0 ) {
3869            if (status->flush_to_zero) {
3870                if (aSig | bSig) {
3871                    float_raise(float_flag_output_denormal, status);
3872                }
3873                return packFloat64(zSign, 0, 0);
3874            }
3875            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3876        }
3877        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3878        zExp = aExp;
3879        goto roundAndPack;
3880    }
3881    aSig |= LIT64( 0x2000000000000000 );
3882    zSig = ( aSig + bSig )<<1;
3883    --zExp;
3884    if ( (int64_t) zSig < 0 ) {
3885        zSig = aSig + bSig;
3886        ++zExp;
3887    }
3888 roundAndPack:
3889    return roundAndPackFloat64(zSign, zExp, zSig, status);
3890
3891}
3892
3893/*----------------------------------------------------------------------------
3894| Returns the result of subtracting the absolute values of the double-
3895| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3896| difference is negated before being returned.  `zSign' is ignored if the
3897| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3898| Standard for Binary Floating-Point Arithmetic.
3899*----------------------------------------------------------------------------*/
3900
3901static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3902                              float_status *status)
3903{
3904    int aExp, bExp, zExp;
3905    uint64_t aSig, bSig, zSig;
3906    int expDiff;
3907
3908    aSig = extractFloat64Frac( a );
3909    aExp = extractFloat64Exp( a );
3910    bSig = extractFloat64Frac( b );
3911    bExp = extractFloat64Exp( b );
3912    expDiff = aExp - bExp;
3913    aSig <<= 10;
3914    bSig <<= 10;
3915    if ( 0 < expDiff ) goto aExpBigger;
3916    if ( expDiff < 0 ) goto bExpBigger;
3917    if ( aExp == 0x7FF ) {
3918        if (aSig | bSig) {
3919            return propagateFloat64NaN(a, b, status);
3920        }
3921        float_raise(float_flag_invalid, status);
3922        return float64_default_nan(status);
3923    }
3924    if ( aExp == 0 ) {
3925        aExp = 1;
3926        bExp = 1;
3927    }
3928    if ( bSig < aSig ) goto aBigger;
3929    if ( aSig < bSig ) goto bBigger;
3930    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3931 bExpBigger:
3932    if ( bExp == 0x7FF ) {
3933        if (bSig) {
3934            return propagateFloat64NaN(a, b, status);
3935        }
3936        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3937    }
3938    if ( aExp == 0 ) {
3939        ++expDiff;
3940    }
3941    else {
3942        aSig |= LIT64( 0x4000000000000000 );
3943    }
3944    shift64RightJamming( aSig, - expDiff, &aSig );
3945    bSig |= LIT64( 0x4000000000000000 );
3946 bBigger:
3947    zSig = bSig - aSig;
3948    zExp = bExp;
3949    zSign ^= 1;
3950    goto normalizeRoundAndPack;
3951 aExpBigger:
3952    if ( aExp == 0x7FF ) {
3953        if (aSig) {
3954            return propagateFloat64NaN(a, b, status);
3955        }
3956        return a;
3957    }
3958    if ( bExp == 0 ) {
3959        --expDiff;
3960    }
3961    else {
3962        bSig |= LIT64( 0x4000000000000000 );
3963    }
3964    shift64RightJamming( bSig, expDiff, &bSig );
3965    aSig |= LIT64( 0x4000000000000000 );
3966 aBigger:
3967    zSig = aSig - bSig;
3968    zExp = aExp;
3969 normalizeRoundAndPack:
3970    --zExp;
3971    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3972
3973}
3974
3975/*----------------------------------------------------------------------------
3976| Returns the result of adding the double-precision floating-point values `a'
3977| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3978| Binary Floating-Point Arithmetic.
3979*----------------------------------------------------------------------------*/
3980
3981float64 float64_add(float64 a, float64 b, float_status *status)
3982{
3983    flag aSign, bSign;
3984    a = float64_squash_input_denormal(a, status);
3985    b = float64_squash_input_denormal(b, status);
3986
3987    aSign = extractFloat64Sign( a );
3988    bSign = extractFloat64Sign( b );
3989    if ( aSign == bSign ) {
3990        return addFloat64Sigs(a, b, aSign, status);
3991    }
3992    else {
3993        return subFloat64Sigs(a, b, aSign, status);
3994    }
3995
3996}
3997
3998/*----------------------------------------------------------------------------
3999| Returns the result of subtracting the double-precision floating-point values
4000| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4001| for Binary Floating-Point Arithmetic.
4002*----------------------------------------------------------------------------*/
4003
4004float64 float64_sub(float64 a, float64 b, float_status *status)
4005{
4006    flag aSign, bSign;
4007    a = float64_squash_input_denormal(a, status);
4008    b = float64_squash_input_denormal(b, status);
4009
4010    aSign = extractFloat64Sign( a );
4011    bSign = extractFloat64Sign( b );
4012    if ( aSign == bSign ) {
4013        return subFloat64Sigs(a, b, aSign, status);
4014    }
4015    else {
4016        return addFloat64Sigs(a, b, aSign, status);
4017    }
4018
4019}
4020
4021/*----------------------------------------------------------------------------
4022| Returns the result of multiplying the double-precision floating-point values
4023| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4024| for Binary Floating-Point Arithmetic.
4025*----------------------------------------------------------------------------*/
4026
4027float64 float64_mul(float64 a, float64 b, float_status *status)
4028{
4029    flag aSign, bSign, zSign;
4030    int aExp, bExp, zExp;
4031    uint64_t aSig, bSig, zSig0, zSig1;
4032
4033    a = float64_squash_input_denormal(a, status);
4034    b = float64_squash_input_denormal(b, status);
4035
4036    aSig = extractFloat64Frac( a );
4037    aExp = extractFloat64Exp( a );
4038    aSign = extractFloat64Sign( a );
4039    bSig = extractFloat64Frac( b );
4040    bExp = extractFloat64Exp( b );
4041    bSign = extractFloat64Sign( b );
4042    zSign = aSign ^ bSign;
4043    if ( aExp == 0x7FF ) {
4044        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4045            return propagateFloat64NaN(a, b, status);
4046        }
4047        if ( ( bExp | bSig ) == 0 ) {
4048            float_raise(float_flag_invalid, status);
4049            return float64_default_nan(status);
4050        }
4051        return packFloat64( zSign, 0x7FF, 0 );
4052    }
4053    if ( bExp == 0x7FF ) {
4054        if (bSig) {
4055            return propagateFloat64NaN(a, b, status);
4056        }
4057        if ( ( aExp | aSig ) == 0 ) {
4058            float_raise(float_flag_invalid, status);
4059            return float64_default_nan(status);
4060        }
4061        return packFloat64( zSign, 0x7FF, 0 );
4062    }
4063    if ( aExp == 0 ) {
4064        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4065        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4066    }
4067    if ( bExp == 0 ) {
4068        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4069        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4070    }
4071    zExp = aExp + bExp - 0x3FF;
4072    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4073    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4074    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4075    zSig0 |= ( zSig1 != 0 );
4076    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4077        zSig0 <<= 1;
4078        --zExp;
4079    }
4080    return roundAndPackFloat64(zSign, zExp, zSig0, status);
4081
4082}
4083
4084/*----------------------------------------------------------------------------
4085| Returns the result of dividing the double-precision floating-point value `a'
4086| by the corresponding value `b'.  The operation is performed according to
4087| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4088*----------------------------------------------------------------------------*/
4089
4090float64 float64_div(float64 a, float64 b, float_status *status)
4091{
4092    flag aSign, bSign, zSign;
4093    int aExp, bExp, zExp;
4094    uint64_t aSig, bSig, zSig;
4095    uint64_t rem0, rem1;
4096    uint64_t term0, term1;
4097    a = float64_squash_input_denormal(a, status);
4098    b = float64_squash_input_denormal(b, status);
4099
4100    aSig = extractFloat64Frac( a );
4101    aExp = extractFloat64Exp( a );
4102    aSign = extractFloat64Sign( a );
4103    bSig = extractFloat64Frac( b );
4104    bExp = extractFloat64Exp( b );
4105    bSign = extractFloat64Sign( b );
4106    zSign = aSign ^ bSign;
4107    if ( aExp == 0x7FF ) {
4108        if (aSig) {
4109            return propagateFloat64NaN(a, b, status);
4110        }
4111        if ( bExp == 0x7FF ) {
4112            if (bSig) {
4113                return propagateFloat64NaN(a, b, status);
4114            }
4115            float_raise(float_flag_invalid, status);
4116            return float64_default_nan(status);
4117        }
4118        return packFloat64( zSign, 0x7FF, 0 );
4119    }
4120    if ( bExp == 0x7FF ) {
4121        if (bSig) {
4122            return propagateFloat64NaN(a, b, status);
4123        }
4124        return packFloat64( zSign, 0, 0 );
4125    }
4126    if ( bExp == 0 ) {
4127        if ( bSig == 0 ) {
4128            if ( ( aExp | aSig ) == 0 ) {
4129                float_raise(float_flag_invalid, status);
4130                return float64_default_nan(status);
4131            }
4132            float_raise(float_flag_divbyzero, status);
4133            return packFloat64( zSign, 0x7FF, 0 );
4134        }
4135        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4136    }
4137    if ( aExp == 0 ) {
4138        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4139        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140    }
4141    zExp = aExp - bExp + 0x3FD;
4142    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4143    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144    if ( bSig <= ( aSig + aSig ) ) {
4145        aSig >>= 1;
4146        ++zExp;
4147    }
4148    zSig = estimateDiv128To64( aSig, 0, bSig );
4149    if ( ( zSig & 0x1FF ) <= 2 ) {
4150        mul64To128( bSig, zSig, &term0, &term1 );
4151        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4152        while ( (int64_t) rem0 < 0 ) {
4153            --zSig;
4154            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4155        }
4156        zSig |= ( rem1 != 0 );
4157    }
4158    return roundAndPackFloat64(zSign, zExp, zSig, status);
4159
4160}
4161
4162/*----------------------------------------------------------------------------
4163| Returns the remainder of the double-precision floating-point value `a'
4164| with respect to the corresponding value `b'.  The operation is performed
4165| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4166*----------------------------------------------------------------------------*/
4167
4168float64 float64_rem(float64 a, float64 b, float_status *status)
4169{
4170    flag aSign, zSign;
4171    int aExp, bExp, expDiff;
4172    uint64_t aSig, bSig;
4173    uint64_t q, alternateASig;
4174    int64_t sigMean;
4175
4176    a = float64_squash_input_denormal(a, status);
4177    b = float64_squash_input_denormal(b, status);
4178    aSig = extractFloat64Frac( a );
4179    aExp = extractFloat64Exp( a );
4180    aSign = extractFloat64Sign( a );
4181    bSig = extractFloat64Frac( b );
4182    bExp = extractFloat64Exp( b );
4183    if ( aExp == 0x7FF ) {
4184        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4185            return propagateFloat64NaN(a, b, status);
4186        }
4187        float_raise(float_flag_invalid, status);
4188        return float64_default_nan(status);
4189    }
4190    if ( bExp == 0x7FF ) {
4191        if (bSig) {
4192            return propagateFloat64NaN(a, b, status);
4193        }
4194        return a;
4195    }
4196    if ( bExp == 0 ) {
4197        if ( bSig == 0 ) {
4198            float_raise(float_flag_invalid, status);
4199            return float64_default_nan(status);
4200        }
4201        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4202    }
4203    if ( aExp == 0 ) {
4204        if ( aSig == 0 ) return a;
4205        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4206    }
4207    expDiff = aExp - bExp;
4208    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4209    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4210    if ( expDiff < 0 ) {
4211        if ( expDiff < -1 ) return a;
4212        aSig >>= 1;
4213    }
4214    q = ( bSig <= aSig );
4215    if ( q ) aSig -= bSig;
4216    expDiff -= 64;
4217    while ( 0 < expDiff ) {
4218        q = estimateDiv128To64( aSig, 0, bSig );
4219        q = ( 2 < q ) ? q - 2 : 0;
4220        aSig = - ( ( bSig>>2 ) * q );
4221        expDiff -= 62;
4222    }
4223    expDiff += 64;
4224    if ( 0 < expDiff ) {
4225        q = estimateDiv128To64( aSig, 0, bSig );
4226        q = ( 2 < q ) ? q - 2 : 0;
4227        q >>= 64 - expDiff;
4228        bSig >>= 2;
4229        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4230    }
4231    else {
4232        aSig >>= 2;
4233        bSig >>= 2;
4234    }
4235    do {
4236        alternateASig = aSig;
4237        ++q;
4238        aSig -= bSig;
4239    } while ( 0 <= (int64_t) aSig );
4240    sigMean = aSig + alternateASig;
4241    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4242        aSig = alternateASig;
4243    }
4244    zSign = ( (int64_t) aSig < 0 );
4245    if ( zSign ) aSig = - aSig;
4246    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4247
4248}
4249
4250/*----------------------------------------------------------------------------
4251| Returns the result of multiplying the double-precision floating-point values
4252| `a' and `b' then adding 'c', with no intermediate rounding step after the
4253| multiplication.  The operation is performed according to the IEC/IEEE
4254| Standard for Binary Floating-Point Arithmetic 754-2008.
4255| The flags argument allows the caller to select negation of the
4256| addend, the intermediate product, or the final result. (The difference
4257| between this and having the caller do a separate negation is that negating
4258| externally will flip the sign bit on NaNs.)
4259*----------------------------------------------------------------------------*/
4260
4261float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4262                       float_status *status)
4263{
4264    flag aSign, bSign, cSign, zSign;
4265    int aExp, bExp, cExp, pExp, zExp, expDiff;
4266    uint64_t aSig, bSig, cSig;
4267    flag pInf, pZero, pSign;
4268    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4269    int shiftcount;
4270    flag signflip, infzero;
4271
4272    a = float64_squash_input_denormal(a, status);
4273    b = float64_squash_input_denormal(b, status);
4274    c = float64_squash_input_denormal(c, status);
4275    aSig = extractFloat64Frac(a);
4276    aExp = extractFloat64Exp(a);
4277    aSign = extractFloat64Sign(a);
4278    bSig = extractFloat64Frac(b);
4279    bExp = extractFloat64Exp(b);
4280    bSign = extractFloat64Sign(b);
4281    cSig = extractFloat64Frac(c);
4282    cExp = extractFloat64Exp(c);
4283    cSign = extractFloat64Sign(c);
4284
4285    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4286               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4287
4288    /* It is implementation-defined whether the cases of (0,inf,qnan)
4289     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4290     * they return if they do), so we have to hand this information
4291     * off to the target-specific pick-a-NaN routine.
4292     */
4293    if (((aExp == 0x7ff) && aSig) ||
4294        ((bExp == 0x7ff) && bSig) ||
4295        ((cExp == 0x7ff) && cSig)) {
4296        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4297    }
4298
4299    if (infzero) {
4300        float_raise(float_flag_invalid, status);
4301        return float64_default_nan(status);
4302    }
4303
4304    if (flags & float_muladd_negate_c) {
4305        cSign ^= 1;
4306    }
4307
4308    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4309
4310    /* Work out the sign and type of the product */
4311    pSign = aSign ^ bSign;
4312    if (flags & float_muladd_negate_product) {
4313        pSign ^= 1;
4314    }
4315    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4316    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4317
4318    if (cExp == 0x7ff) {
4319        if (pInf && (pSign ^ cSign)) {
4320            /* addition of opposite-signed infinities => InvalidOperation */
4321            float_raise(float_flag_invalid, status);
4322            return float64_default_nan(status);
4323        }
4324        /* Otherwise generate an infinity of the same sign */
4325        return packFloat64(cSign ^ signflip, 0x7ff, 0);
4326    }
4327
4328    if (pInf) {
4329        return packFloat64(pSign ^ signflip, 0x7ff, 0);
4330    }
4331
4332    if (pZero) {
4333        if (cExp == 0) {
4334            if (cSig == 0) {
4335                /* Adding two exact zeroes */
4336                if (pSign == cSign) {
4337                    zSign = pSign;
4338                } else if (status->float_rounding_mode == float_round_down) {
4339                    zSign = 1;
4340                } else {
4341                    zSign = 0;
4342                }
4343                return packFloat64(zSign ^ signflip, 0, 0);
4344            }
4345            /* Exact zero plus a denorm */
4346            if (status->flush_to_zero) {
4347                float_raise(float_flag_output_denormal, status);
4348                return packFloat64(cSign ^ signflip, 0, 0);
4349            }
4350        }
4351        /* Zero plus something non-zero : just return the something */
4352        if (flags & float_muladd_halve_result) {
4353            if (cExp == 0) {
4354                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4355            }
4356            /* Subtract one to halve, and one again because roundAndPackFloat64
4357             * wants one less than the true exponent.
4358             */
4359            cExp -= 2;
4360            cSig = (cSig | 0x0010000000000000ULL) << 10;
4361            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4362        }
4363        return packFloat64(cSign ^ signflip, cExp, cSig);
4364    }
4365
4366    if (aExp == 0) {
4367        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4368    }
4369    if (bExp == 0) {
4370        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4371    }
4372
4373    /* Calculate the actual result a * b + c */
4374
4375    /* Multiply first; this is easy. */
4376    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4377     * because we want the true exponent, not the "one-less-than"
4378     * flavour that roundAndPackFloat64() takes.
4379     */
4380    pExp = aExp + bExp - 0x3fe;
4381    aSig = (aSig | LIT64(0x0010000000000000))<<10;
4382    bSig = (bSig | LIT64(0x0010000000000000))<<11;
4383    mul64To128(aSig, bSig, &pSig0, &pSig1);
4384    if ((int64_t)(pSig0 << 1) >= 0) {
4385        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4386        pExp--;
4387    }
4388
4389    zSign = pSign ^ signflip;
4390
4391    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4392     * bit in position 126.
4393     */
4394    if (cExp == 0) {
4395        if (!cSig) {
4396            /* Throw out the special case of c being an exact zero now */
4397            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4398            if (flags & float_muladd_halve_result) {
4399                pExp--;
4400            }
4401            return roundAndPackFloat64(zSign, pExp - 1,
4402                                       pSig1, status);
4403        }
4404        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4405    }
4406
4407    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4408     * significand of the addend, with the explicit bit in position 126.
4409     */
4410    cSig0 = cSig << (126 - 64 - 52);
4411    cSig1 = 0;
4412    cSig0 |= LIT64(0x4000000000000000);
4413    expDiff = pExp - cExp;
4414
4415    if (pSign == cSign) {
4416        /* Addition */
4417        if (expDiff > 0) {
4418            /* scale c to match p */
4419            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4420            zExp = pExp;
4421        } else if (expDiff < 0) {
4422            /* scale p to match c */
4423            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4424            zExp = cExp;
4425        } else {
4426            /* no scaling needed */
4427            zExp = cExp;
4428        }
4429        /* Add significands and make sure explicit bit ends up in posn 126 */
4430        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4431        if ((int64_t)zSig0 < 0) {
4432            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4433        } else {
4434            zExp--;
4435        }
4436        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4437        if (flags & float_muladd_halve_result) {
4438            zExp--;
4439        }
4440        return roundAndPackFloat64(zSign, zExp, zSig1, status);
4441    } else {
4442        /* Subtraction */
4443        if (expDiff > 0) {
4444            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4445            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4446            zExp = pExp;
4447        } else if (expDiff < 0) {
4448            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4449            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4450            zExp = cExp;
4451            zSign ^= 1;
4452        } else {
4453            zExp = pExp;
4454            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4455                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4456            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4457                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4458                zSign ^= 1;
4459            } else {
4460                /* Exact zero */
4461                zSign = signflip;
4462                if (status->float_rounding_mode == float_round_down) {
4463                    zSign ^= 1;
4464                }
4465                return packFloat64(zSign, 0, 0);
4466            }
4467        }
4468        --zExp;
4469        /* Do the equivalent of normalizeRoundAndPackFloat64() but
4470         * starting with the significand in a pair of uint64_t.
4471         */
4472        if (zSig0) {
4473            shiftcount = countLeadingZeros64(zSig0) - 1;
4474            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4475            if (zSig1) {
4476                zSig0 |= 1;
4477            }
4478            zExp -= shiftcount;
4479        } else {
4480            shiftcount = countLeadingZeros64(zSig1);
4481            if (shiftcount == 0) {
4482                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4483                zExp -= 63;
4484            } else {
4485                shiftcount--;
4486                zSig0 = zSig1 << shiftcount;
4487                zExp -= (shiftcount + 64);
4488            }
4489        }
4490        if (flags & float_muladd_halve_result) {
4491            zExp--;
4492        }
4493        return roundAndPackFloat64(zSign, zExp, zSig0, status);
4494    }
4495}
4496
4497/*----------------------------------------------------------------------------
4498| Returns the square root of the double-precision floating-point value `a'.
4499| The operation is performed according to the IEC/IEEE Standard for Binary
4500| Floating-Point Arithmetic.
4501*----------------------------------------------------------------------------*/
4502
4503float64 float64_sqrt(float64 a, float_status *status)
4504{
4505    flag aSign;
4506    int aExp, zExp;
4507    uint64_t aSig, zSig, doubleZSig;
4508    uint64_t rem0, rem1, term0, term1;
4509    a = float64_squash_input_denormal(a, status);
4510
4511    aSig = extractFloat64Frac( a );
4512    aExp = extractFloat64Exp( a );
4513    aSign = extractFloat64Sign( a );
4514    if ( aExp == 0x7FF ) {
4515        if (aSig) {
4516            return propagateFloat64NaN(a, a, status);
4517        }
4518        if ( ! aSign ) return a;
4519        float_raise(float_flag_invalid, status);
4520        return float64_default_nan(status);
4521    }
4522    if ( aSign ) {
4523        if ( ( aExp | aSig ) == 0 ) return a;
4524        float_raise(float_flag_invalid, status);
4525        return float64_default_nan(status);
4526    }
4527    if ( aExp == 0 ) {
4528        if ( aSig == 0 ) return float64_zero;
4529        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4530    }
4531    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4532    aSig |= LIT64( 0x0010000000000000 );
4533    zSig = estimateSqrt32( aExp, aSig>>21 );
4534    aSig <<= 9 - ( aExp & 1 );
4535    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4536    if ( ( zSig & 0x1FF ) <= 5 ) {
4537        doubleZSig = zSig<<1;
4538        mul64To128( zSig, zSig, &term0, &term1 );
4539        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4540        while ( (int64_t) rem0 < 0 ) {
4541            --zSig;
4542            doubleZSig -= 2;
4543            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4544        }
4545        zSig |= ( ( rem0 | rem1 ) != 0 );
4546    }
4547    return roundAndPackFloat64(0, zExp, zSig, status);
4548
4549}
4550
4551/*----------------------------------------------------------------------------
4552| Returns the binary log of the double-precision floating-point value `a'.
4553| The operation is performed according to the IEC/IEEE Standard for Binary
4554| Floating-Point Arithmetic.
4555*----------------------------------------------------------------------------*/
4556float64 float64_log2(float64 a, float_status *status)
4557{
4558    flag aSign, zSign;
4559    int aExp;
4560    uint64_t aSig, aSig0, aSig1, zSig, i;
4561    a = float64_squash_input_denormal(a, status);
4562
4563    aSig = extractFloat64Frac( a );
4564    aExp = extractFloat64Exp( a );
4565    aSign = extractFloat64Sign( a );
4566
4567    if ( aExp == 0 ) {
4568        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4569        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4570    }
4571    if ( aSign ) {
4572        float_raise(float_flag_invalid, status);
4573        return float64_default_nan(status);
4574    }
4575    if ( aExp == 0x7FF ) {
4576        if (aSig) {
4577            return propagateFloat64NaN(a, float64_zero, status);
4578        }
4579        return a;
4580    }
4581
4582    aExp -= 0x3FF;
4583    aSig |= LIT64( 0x0010000000000000 );
4584    zSign = aExp < 0;
4585    zSig = (uint64_t)aExp << 52;
4586    for (i = 1LL << 51; i > 0; i >>= 1) {
4587        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4588        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4589        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4590            aSig >>= 1;
4591            zSig |= i;
4592        }
4593    }
4594
4595    if ( zSign )
4596        zSig = -zSig;
4597    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4598}
4599
4600/*----------------------------------------------------------------------------
4601| Returns 1 if the double-precision floating-point value `a' is equal to the
4602| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4603| if either operand is a NaN.  Otherwise, the comparison is performed
4604| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4605*----------------------------------------------------------------------------*/
4606
4607int float64_eq(float64 a, float64 b, float_status *status)
4608{
4609    uint64_t av, bv;
4610    a = float64_squash_input_denormal(a, status);
4611    b = float64_squash_input_denormal(b, status);
4612
4613    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4614         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4615       ) {
4616        float_raise(float_flag_invalid, status);
4617        return 0;
4618    }
4619    av = float64_val(a);
4620    bv = float64_val(b);
4621    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4622
4623}
4624
4625/*----------------------------------------------------------------------------
4626| Returns 1 if the double-precision floating-point value `a' is less than or
4627| equal to the corresponding value `b', and 0 otherwise.  The invalid
4628| exception is raised if either operand is a NaN.  The comparison is performed
4629| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630*----------------------------------------------------------------------------*/
4631
4632int float64_le(float64 a, float64 b, float_status *status)
4633{
4634    flag aSign, bSign;
4635    uint64_t av, bv;
4636    a = float64_squash_input_denormal(a, status);
4637    b = float64_squash_input_denormal(b, status);
4638
4639    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4640         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4641       ) {
4642        float_raise(float_flag_invalid, status);
4643        return 0;
4644    }
4645    aSign = extractFloat64Sign( a );
4646    bSign = extractFloat64Sign( b );
4647    av = float64_val(a);
4648    bv = float64_val(b);
4649    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4650    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4651
4652}
4653
4654/*----------------------------------------------------------------------------
4655| Returns 1 if the double-precision floating-point value `a' is less than
4656| the corresponding value `b', and 0 otherwise.  The invalid exception is
4657| raised if either operand is a NaN.  The comparison is performed according
4658| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659*----------------------------------------------------------------------------*/
4660
4661int float64_lt(float64 a, float64 b, float_status *status)
4662{
4663    flag aSign, bSign;
4664    uint64_t av, bv;
4665
4666    a = float64_squash_input_denormal(a, status);
4667    b = float64_squash_input_denormal(b, status);
4668    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4669         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4670       ) {
4671        float_raise(float_flag_invalid, status);
4672        return 0;
4673    }
4674    aSign = extractFloat64Sign( a );
4675    bSign = extractFloat64Sign( b );
4676    av = float64_val(a);
4677    bv = float64_val(b);
4678    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4679    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4680
4681}
4682
4683/*----------------------------------------------------------------------------
4684| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4685| be compared, and 0 otherwise.  The invalid exception is raised if either
4686| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4687| Standard for Binary Floating-Point Arithmetic.
4688*----------------------------------------------------------------------------*/
4689
4690int float64_unordered(float64 a, float64 b, float_status *status)
4691{
4692    a = float64_squash_input_denormal(a, status);
4693    b = float64_squash_input_denormal(b, status);
4694
4695    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4696         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4697       ) {
4698        float_raise(float_flag_invalid, status);
4699        return 1;
4700    }
4701    return 0;
4702}
4703
4704/*----------------------------------------------------------------------------
4705| Returns 1 if the double-precision floating-point value `a' is equal to the
4706| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4707| exception.The comparison is performed according to the IEC/IEEE Standard
4708| for Binary Floating-Point Arithmetic.
4709*----------------------------------------------------------------------------*/
4710
4711int float64_eq_quiet(float64 a, float64 b, float_status *status)
4712{
4713    uint64_t av, bv;
4714    a = float64_squash_input_denormal(a, status);
4715    b = float64_squash_input_denormal(b, status);
4716
4717    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4718         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4719       ) {
4720        if (float64_is_signaling_nan(a, status)
4721         || float64_is_signaling_nan(b, status)) {
4722            float_raise(float_flag_invalid, status);
4723        }
4724        return 0;
4725    }
4726    av = float64_val(a);
4727    bv = float64_val(b);
4728    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4729
4730}
4731
4732/*----------------------------------------------------------------------------
4733| Returns 1 if the double-precision floating-point value `a' is less than or
4734| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4735| cause an exception.  Otherwise, the comparison is performed according to the
4736| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737*----------------------------------------------------------------------------*/
4738
4739int float64_le_quiet(float64 a, float64 b, float_status *status)
4740{
4741    flag aSign, bSign;
4742    uint64_t av, bv;
4743    a = float64_squash_input_denormal(a, status);
4744    b = float64_squash_input_denormal(b, status);
4745
4746    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4747         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4748       ) {
4749        if (float64_is_signaling_nan(a, status)
4750         || float64_is_signaling_nan(b, status)) {
4751            float_raise(float_flag_invalid, status);
4752        }
4753        return 0;
4754    }
4755    aSign = extractFloat64Sign( a );
4756    bSign = extractFloat64Sign( b );
4757    av = float64_val(a);
4758    bv = float64_val(b);
4759    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4760    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4761
4762}
4763
4764/*----------------------------------------------------------------------------
4765| Returns 1 if the double-precision floating-point value `a' is less than
4766| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4767| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4768| Standard for Binary Floating-Point Arithmetic.
4769*----------------------------------------------------------------------------*/
4770
4771int float64_lt_quiet(float64 a, float64 b, float_status *status)
4772{
4773    flag aSign, bSign;
4774    uint64_t av, bv;
4775    a = float64_squash_input_denormal(a, status);
4776    b = float64_squash_input_denormal(b, status);
4777
4778    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4779         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4780       ) {
4781        if (float64_is_signaling_nan(a, status)
4782         || float64_is_signaling_nan(b, status)) {
4783            float_raise(float_flag_invalid, status);
4784        }
4785        return 0;
4786    }
4787    aSign = extractFloat64Sign( a );
4788    bSign = extractFloat64Sign( b );
4789    av = float64_val(a);
4790    bv = float64_val(b);
4791    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4792    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4793
4794}
4795
4796/*----------------------------------------------------------------------------
4797| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4798| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4799| comparison is performed according to the IEC/IEEE Standard for Binary
4800| Floating-Point Arithmetic.
4801*----------------------------------------------------------------------------*/
4802
4803int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4804{
4805    a = float64_squash_input_denormal(a, status);
4806    b = float64_squash_input_denormal(b, status);
4807
4808    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4809         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4810       ) {
4811        if (float64_is_signaling_nan(a, status)
4812         || float64_is_signaling_nan(b, status)) {
4813            float_raise(float_flag_invalid, status);
4814        }
4815        return 1;
4816    }
4817    return 0;
4818}
4819
4820/*----------------------------------------------------------------------------
4821| Returns the result of converting the extended double-precision floating-
4822| point value `a' to the 32-bit two's complement integer format.  The
4823| conversion is performed according to the IEC/IEEE Standard for Binary
4824| Floating-Point Arithmetic---which means in particular that the conversion
4825| is rounded according to the current rounding mode.  If `a' is a NaN, the
4826| largest positive integer is returned.  Otherwise, if the conversion
4827| overflows, the largest integer with the same sign as `a' is returned.
4828*----------------------------------------------------------------------------*/
4829
4830int32_t floatx80_to_int32(floatx80 a, float_status *status)
4831{
4832    flag aSign;
4833    int32_t aExp, shiftCount;
4834    uint64_t aSig;
4835
4836    if (floatx80_invalid_encoding(a)) {
4837        float_raise(float_flag_invalid, status);
4838        return 1 << 31;
4839    }
4840    aSig = extractFloatx80Frac( a );
4841    aExp = extractFloatx80Exp( a );
4842    aSign = extractFloatx80Sign( a );
4843    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4844    shiftCount = 0x4037 - aExp;
4845    if ( shiftCount <= 0 ) shiftCount = 1;
4846    shift64RightJamming( aSig, shiftCount, &aSig );
4847    return roundAndPackInt32(aSign, aSig, status);
4848
4849}
4850
4851/*----------------------------------------------------------------------------
4852| Returns the result of converting the extended double-precision floating-
4853| point value `a' to the 32-bit two's complement integer format.  The
4854| conversion is performed according to the IEC/IEEE Standard for Binary
4855| Floating-Point Arithmetic, except that the conversion is always rounded
4856| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4857| Otherwise, if the conversion overflows, the largest integer with the same
4858| sign as `a' is returned.
4859*----------------------------------------------------------------------------*/
4860
4861int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4862{
4863    flag aSign;
4864    int32_t aExp, shiftCount;
4865    uint64_t aSig, savedASig;
4866    int32_t z;
4867
4868    if (floatx80_invalid_encoding(a)) {
4869        float_raise(float_flag_invalid, status);
4870        return 1 << 31;
4871    }
4872    aSig = extractFloatx80Frac( a );
4873    aExp = extractFloatx80Exp( a );
4874    aSign = extractFloatx80Sign( a );
4875    if ( 0x401E < aExp ) {
4876        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4877        goto invalid;
4878    }
4879    else if ( aExp < 0x3FFF ) {
4880        if (aExp || aSig) {
4881            status->float_exception_flags |= float_flag_inexact;
4882        }
4883        return 0;
4884    }
4885    shiftCount = 0x403E - aExp;
4886    savedASig = aSig;
4887    aSig >>= shiftCount;
4888    z = aSig;
4889    if ( aSign ) z = - z;
4890    if ( ( z < 0 ) ^ aSign ) {
4891 invalid:
4892        float_raise(float_flag_invalid, status);
4893        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4894    }
4895    if ( ( aSig<<shiftCount ) != savedASig ) {
4896        status->float_exception_flags |= float_flag_inexact;
4897    }
4898    return z;
4899
4900}
4901
4902/*----------------------------------------------------------------------------
4903| Returns the result of converting the extended double-precision floating-
4904| point value `a' to the 64-bit two's complement integer format.  The
4905| conversion is performed according to the IEC/IEEE Standard for Binary
4906| Floating-Point Arithmetic---which means in particular that the conversion
4907| is rounded according to the current rounding mode.  If `a' is a NaN,
4908| the largest positive integer is returned.  Otherwise, if the conversion
4909| overflows, the largest integer with the same sign as `a' is returned.
4910*----------------------------------------------------------------------------*/
4911
4912int64_t floatx80_to_int64(floatx80 a, float_status *status)
4913{
4914    flag aSign;
4915    int32_t aExp, shiftCount;
4916    uint64_t aSig, aSigExtra;
4917
4918    if (floatx80_invalid_encoding(a)) {
4919        float_raise(float_flag_invalid, status);
4920        return 1ULL << 63;
4921    }
4922    aSig = extractFloatx80Frac( a );
4923    aExp = extractFloatx80Exp( a );
4924    aSign = extractFloatx80Sign( a );
4925    shiftCount = 0x403E - aExp;
4926    if ( shiftCount <= 0 ) {
4927        if ( shiftCount ) {
4928            float_raise(float_flag_invalid, status);
4929            if (    ! aSign
4930                 || (    ( aExp == 0x7FFF )
4931                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4932               ) {
4933                return LIT64( 0x7FFFFFFFFFFFFFFF );
4934            }
4935            return (int64_t) LIT64( 0x8000000000000000 );
4936        }
4937        aSigExtra = 0;
4938    }
4939    else {
4940        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4941    }
4942    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4943
4944}
4945
4946/*----------------------------------------------------------------------------
4947| Returns the result of converting the extended double-precision floating-
4948| point value `a' to the 64-bit two's complement integer format.  The
4949| conversion is performed according to the IEC/IEEE Standard for Binary
4950| Floating-Point Arithmetic, except that the conversion is always rounded
4951| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4952| Otherwise, if the conversion overflows, the largest integer with the same
4953| sign as `a' is returned.
4954*----------------------------------------------------------------------------*/
4955
4956int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4957{
4958    flag aSign;
4959    int32_t aExp, shiftCount;
4960    uint64_t aSig;
4961    int64_t z;
4962
4963    if (floatx80_invalid_encoding(a)) {
4964        float_raise(float_flag_invalid, status);
4965        return 1ULL << 63;
4966    }
4967    aSig = extractFloatx80Frac( a );
4968    aExp = extractFloatx80Exp( a );
4969    aSign = extractFloatx80Sign( a );
4970    shiftCount = aExp - 0x403E;
4971    if ( 0 <= shiftCount ) {
4972        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4973        if ( ( a.high != 0xC03E ) || aSig ) {
4974            float_raise(float_flag_invalid, status);
4975            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4976                return LIT64( 0x7FFFFFFFFFFFFFFF );
4977            }
4978        }
4979        return (int64_t) LIT64( 0x8000000000000000 );
4980    }
4981    else if ( aExp < 0x3FFF ) {
4982        if (aExp | aSig) {
4983            status->float_exception_flags |= float_flag_inexact;
4984        }
4985        return 0;
4986    }
4987    z = aSig>>( - shiftCount );
4988    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4989        status->float_exception_flags |= float_flag_inexact;
4990    }
4991    if ( aSign ) z = - z;
4992    return z;
4993
4994}
4995
4996/*----------------------------------------------------------------------------
4997| Returns the result of converting the extended double-precision floating-
4998| point value `a' to the single-precision floating-point format.  The
4999| conversion is performed according to the IEC/IEEE Standard for Binary
5000| Floating-Point Arithmetic.
5001*----------------------------------------------------------------------------*/
5002
5003float32 floatx80_to_float32(floatx80 a, float_status *status)
5004{
5005    flag aSign;
5006    int32_t aExp;
5007    uint64_t aSig;
5008
5009    if (floatx80_invalid_encoding(a)) {
5010        float_raise(float_flag_invalid, status);
5011        return float32_default_nan(status);
5012    }
5013    aSig = extractFloatx80Frac( a );
5014    aExp = extractFloatx80Exp( a );
5015    aSign = extractFloatx80Sign( a );
5016    if ( aExp == 0x7FFF ) {
5017        if ( (uint64_t) ( aSig<<1 ) ) {
5018            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5019        }
5020        return packFloat32( aSign, 0xFF, 0 );
5021    }
5022    shift64RightJamming( aSig, 33, &aSig );
5023    if ( aExp || aSig ) aExp -= 0x3F81;
5024    return roundAndPackFloat32(aSign, aExp, aSig, status);
5025
5026}
5027
5028/*----------------------------------------------------------------------------
5029| Returns the result of converting the extended double-precision floating-
5030| point value `a' to the double-precision floating-point format.  The
5031| conversion is performed according to the IEC/IEEE Standard for Binary
5032| Floating-Point Arithmetic.
5033*----------------------------------------------------------------------------*/
5034
5035float64 floatx80_to_float64(floatx80 a, float_status *status)
5036{
5037    flag aSign;
5038    int32_t aExp;
5039    uint64_t aSig, zSig;
5040
5041    if (floatx80_invalid_encoding(a)) {
5042        float_raise(float_flag_invalid, status);
5043        return float64_default_nan(status);
5044    }
5045    aSig = extractFloatx80Frac( a );
5046    aExp = extractFloatx80Exp( a );
5047    aSign = extractFloatx80Sign( a );
5048    if ( aExp == 0x7FFF ) {
5049        if ( (uint64_t) ( aSig<<1 ) ) {
5050            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5051        }
5052        return packFloat64( aSign, 0x7FF, 0 );
5053    }
5054    shift64RightJamming( aSig, 1, &zSig );
5055    if ( aExp || aSig ) aExp -= 0x3C01;
5056    return roundAndPackFloat64(aSign, aExp, zSig, status);
5057
5058}
5059
5060/*----------------------------------------------------------------------------
5061| Returns the result of converting the extended double-precision floating-
5062| point value `a' to the quadruple-precision floating-point format.  The
5063| conversion is performed according to the IEC/IEEE Standard for Binary
5064| Floating-Point Arithmetic.
5065*----------------------------------------------------------------------------*/
5066
5067float128 floatx80_to_float128(floatx80 a, float_status *status)
5068{
5069    flag aSign;
5070    int aExp;
5071    uint64_t aSig, zSig0, zSig1;
5072
5073    if (floatx80_invalid_encoding(a)) {
5074        float_raise(float_flag_invalid, status);
5075        return float128_default_nan(status);
5076    }
5077    aSig = extractFloatx80Frac( a );
5078    aExp = extractFloatx80Exp( a );
5079    aSign = extractFloatx80Sign( a );
5080    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5081        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5082    }
5083    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5084    return packFloat128( aSign, aExp, zSig0, zSig1 );
5085
5086}
5087
5088/*----------------------------------------------------------------------------
5089| Rounds the extended double-precision floating-point value `a'
5090| to the precision provided by floatx80_rounding_precision and returns the
5091| result as an extended double-precision floating-point value.
5092| The operation is performed according to the IEC/IEEE Standard for Binary
5093| Floating-Point Arithmetic.
5094*----------------------------------------------------------------------------*/
5095
5096floatx80 floatx80_round(floatx80 a, float_status *status)
5097{
5098    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5099                                extractFloatx80Sign(a),
5100                                extractFloatx80Exp(a),
5101                                extractFloatx80Frac(a), 0, status);
5102}
5103
5104/*----------------------------------------------------------------------------
5105| Rounds the extended double-precision floating-point value `a' to an integer,
5106| and returns the result as an extended quadruple-precision floating-point
5107| value.  The operation is performed according to the IEC/IEEE Standard for
5108| Binary Floating-Point Arithmetic.
5109*----------------------------------------------------------------------------*/
5110
5111floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5112{
5113    flag aSign;
5114    int32_t aExp;
5115    uint64_t lastBitMask, roundBitsMask;
5116    floatx80 z;
5117
5118    if (floatx80_invalid_encoding(a)) {
5119        float_raise(float_flag_invalid, status);
5120        return floatx80_default_nan(status);
5121    }
5122    aExp = extractFloatx80Exp( a );
5123    if ( 0x403E <= aExp ) {
5124        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5125            return propagateFloatx80NaN(a, a, status);
5126        }
5127        return a;
5128    }
5129    if ( aExp < 0x3FFF ) {
5130        if (    ( aExp == 0 )
5131             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5132            return a;
5133        }
5134        status->float_exception_flags |= float_flag_inexact;
5135        aSign = extractFloatx80Sign( a );
5136        switch (status->float_rounding_mode) {
5137         case float_round_nearest_even:
5138            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5139               ) {
5140                return
5141                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5142            }
5143            break;
5144        case float_round_ties_away:
5145            if (aExp == 0x3FFE) {
5146                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5147            }
5148            break;
5149         case float_round_down:
5150            return
5151                  aSign ?
5152                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5153                : packFloatx80( 0, 0, 0 );
5154         case float_round_up:
5155            return
5156                  aSign ? packFloatx80( 1, 0, 0 )
5157                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5158        }
5159        return packFloatx80( aSign, 0, 0 );
5160    }
5161    lastBitMask = 1;
5162    lastBitMask <<= 0x403E - aExp;
5163    roundBitsMask = lastBitMask - 1;
5164    z = a;
5165    switch (status->float_rounding_mode) {
5166    case float_round_nearest_even:
5167        z.low += lastBitMask>>1;
5168        if ((z.low & roundBitsMask) == 0) {
5169            z.low &= ~lastBitMask;
5170        }
5171        break;
5172    case float_round_ties_away:
5173        z.low += lastBitMask >> 1;
5174        break;
5175    case float_round_to_zero:
5176        break;
5177    case float_round_up:
5178        if (!extractFloatx80Sign(z)) {
5179            z.low += roundBitsMask;
5180        }
5181        break;
5182    case float_round_down:
5183        if (extractFloatx80Sign(z)) {
5184            z.low += roundBitsMask;
5185        }
5186        break;
5187    default:
5188        abort();
5189    }
5190    z.low &= ~ roundBitsMask;
5191    if ( z.low == 0 ) {
5192        ++z.high;
5193        z.low = LIT64( 0x8000000000000000 );
5194    }
5195    if (z.low != a.low) {
5196        status->float_exception_flags |= float_flag_inexact;
5197    }
5198    return z;
5199
5200}
5201
5202/*----------------------------------------------------------------------------
5203| Returns the result of adding the absolute values of the extended double-
5204| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5205| negated before being returned.  `zSign' is ignored if the result is a NaN.
5206| The addition is performed according to the IEC/IEEE Standard for Binary
5207| Floating-Point Arithmetic.
5208*----------------------------------------------------------------------------*/
5209
5210static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5211                                float_status *status)
5212{
5213    int32_t aExp, bExp, zExp;
5214    uint64_t aSig, bSig, zSig0, zSig1;
5215    int32_t expDiff;
5216
5217    aSig = extractFloatx80Frac( a );
5218    aExp = extractFloatx80Exp( a );
5219    bSig = extractFloatx80Frac( b );
5220    bExp = extractFloatx80Exp( b );
5221    expDiff = aExp - bExp;
5222    if ( 0 < expDiff ) {
5223        if ( aExp == 0x7FFF ) {
5224            if ((uint64_t)(aSig << 1)) {
5225                return propagateFloatx80NaN(a, b, status);
5226            }
5227            return a;
5228        }
5229        if ( bExp == 0 ) --expDiff;
5230        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5231        zExp = aExp;
5232    }
5233    else if ( expDiff < 0 ) {
5234        if ( bExp == 0x7FFF ) {
5235            if ((uint64_t)(bSig << 1)) {
5236                return propagateFloatx80NaN(a, b, status);
5237            }
5238            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5239        }
5240        if ( aExp == 0 ) ++expDiff;
5241        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5242        zExp = bExp;
5243    }
5244    else {
5245        if ( aExp == 0x7FFF ) {
5246            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5247                return propagateFloatx80NaN(a, b, status);
5248            }
5249            return a;
5250        }
5251        zSig1 = 0;
5252        zSig0 = aSig + bSig;
5253        if ( aExp == 0 ) {
5254            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5255            goto roundAndPack;
5256        }
5257        zExp = aExp;
5258        goto shiftRight1;
5259    }
5260    zSig0 = aSig + bSig;
5261    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5262 shiftRight1:
5263    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5264    zSig0 |= LIT64( 0x8000000000000000 );
5265    ++zExp;
5266 roundAndPack:
5267    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5268                                zSign, zExp, zSig0, zSig1, status);
5269}
5270
5271/*----------------------------------------------------------------------------
5272| Returns the result of subtracting the absolute values of the extended
5273| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5274| difference is negated before being returned.  `zSign' is ignored if the
5275| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5276| Standard for Binary Floating-Point Arithmetic.
5277*----------------------------------------------------------------------------*/
5278
5279static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5280                                float_status *status)
5281{
5282    int32_t aExp, bExp, zExp;
5283    uint64_t aSig, bSig, zSig0, zSig1;
5284    int32_t expDiff;
5285
5286    aSig = extractFloatx80Frac( a );
5287    aExp = extractFloatx80Exp( a );
5288    bSig = extractFloatx80Frac( b );
5289    bExp = extractFloatx80Exp( b );
5290    expDiff = aExp - bExp;
5291    if ( 0 < expDiff ) goto aExpBigger;
5292    if ( expDiff < 0 ) goto bExpBigger;
5293    if ( aExp == 0x7FFF ) {
5294        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5295            return propagateFloatx80NaN(a, b, status);
5296        }
5297        float_raise(float_flag_invalid, status);
5298        return floatx80_default_nan(status);
5299    }
5300    if ( aExp == 0 ) {
5301        aExp = 1;
5302        bExp = 1;
5303    }
5304    zSig1 = 0;
5305    if ( bSig < aSig ) goto aBigger;
5306    if ( aSig < bSig ) goto bBigger;
5307    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5308 bExpBigger:
5309    if ( bExp == 0x7FFF ) {
5310        if ((uint64_t)(bSig << 1)) {
5311            return propagateFloatx80NaN(a, b, status);
5312        }
5313        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5314    }
5315    if ( aExp == 0 ) ++expDiff;
5316    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5317 bBigger:
5318    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5319    zExp = bExp;
5320    zSign ^= 1;
5321    goto normalizeRoundAndPack;
5322 aExpBigger:
5323    if ( aExp == 0x7FFF ) {
5324        if ((uint64_t)(aSig << 1)) {
5325            return propagateFloatx80NaN(a, b, status);
5326        }
5327        return a;
5328    }
5329    if ( bExp == 0 ) --expDiff;
5330    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5331 aBigger:
5332    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5333    zExp = aExp;
5334 normalizeRoundAndPack:
5335    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5336                                         zSign, zExp, zSig0, zSig1, status);
5337}
5338
5339/*----------------------------------------------------------------------------
5340| Returns the result of adding the extended double-precision floating-point
5341| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5342| Standard for Binary Floating-Point Arithmetic.
5343*----------------------------------------------------------------------------*/
5344
5345floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5346{
5347    flag aSign, bSign;
5348
5349    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5350        float_raise(float_flag_invalid, status);
5351        return floatx80_default_nan(status);
5352    }
5353    aSign = extractFloatx80Sign( a );
5354    bSign = extractFloatx80Sign( b );
5355    if ( aSign == bSign ) {
5356        return addFloatx80Sigs(a, b, aSign, status);
5357    }
5358    else {
5359        return subFloatx80Sigs(a, b, aSign, status);
5360    }
5361
5362}
5363
5364/*----------------------------------------------------------------------------
5365| Returns the result of subtracting the extended double-precision floating-
5366| point values `a' and `b'.  The operation is performed according to the
5367| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368*----------------------------------------------------------------------------*/
5369
5370floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5371{
5372    flag aSign, bSign;
5373
5374    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5375        float_raise(float_flag_invalid, status);
5376        return floatx80_default_nan(status);
5377    }
5378    aSign = extractFloatx80Sign( a );
5379    bSign = extractFloatx80Sign( b );
5380    if ( aSign == bSign ) {
5381        return subFloatx80Sigs(a, b, aSign, status);
5382    }
5383    else {
5384        return addFloatx80Sigs(a, b, aSign, status);
5385    }
5386
5387}
5388
5389/*----------------------------------------------------------------------------
5390| Returns the result of multiplying the extended double-precision floating-
5391| point values `a' and `b'.  The operation is performed according to the
5392| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5393*----------------------------------------------------------------------------*/
5394
5395floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5396{
5397    flag aSign, bSign, zSign;
5398    int32_t aExp, bExp, zExp;
5399    uint64_t aSig, bSig, zSig0, zSig1;
5400
5401    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5402        float_raise(float_flag_invalid, status);
5403        return floatx80_default_nan(status);
5404    }
5405    aSig = extractFloatx80Frac( a );
5406    aExp = extractFloatx80Exp( a );
5407    aSign = extractFloatx80Sign( a );
5408    bSig = extractFloatx80Frac( b );
5409    bExp = extractFloatx80Exp( b );
5410    bSign = extractFloatx80Sign( b );
5411    zSign = aSign ^ bSign;
5412    if ( aExp == 0x7FFF ) {
5413        if (    (uint64_t) ( aSig<<1 )
5414             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5415            return propagateFloatx80NaN(a, b, status);
5416        }
5417        if ( ( bExp | bSig ) == 0 ) goto invalid;
5418        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5419    }
5420    if ( bExp == 0x7FFF ) {
5421        if ((uint64_t)(bSig << 1)) {
5422            return propagateFloatx80NaN(a, b, status);
5423        }
5424        if ( ( aExp | aSig ) == 0 ) {
5425 invalid:
5426            float_raise(float_flag_invalid, status);
5427            return floatx80_default_nan(status);
5428        }
5429        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5430    }
5431    if ( aExp == 0 ) {
5432        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5433        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5434    }
5435    if ( bExp == 0 ) {
5436        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5437        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5438    }
5439    zExp = aExp + bExp - 0x3FFE;
5440    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5441    if ( 0 < (int64_t) zSig0 ) {
5442        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5443        --zExp;
5444    }
5445    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5446                                zSign, zExp, zSig0, zSig1, status);
5447}
5448
5449/*----------------------------------------------------------------------------
5450| Returns the result of dividing the extended double-precision floating-point
5451| value `a' by the corresponding value `b'.  The operation is performed
5452| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453*----------------------------------------------------------------------------*/
5454
5455floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5456{
5457    flag aSign, bSign, zSign;
5458    int32_t aExp, bExp, zExp;
5459    uint64_t aSig, bSig, zSig0, zSig1;
5460    uint64_t rem0, rem1, rem2, term0, term1, term2;
5461
5462    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5463        float_raise(float_flag_invalid, status);
5464        return floatx80_default_nan(status);
5465    }
5466    aSig = extractFloatx80Frac( a );
5467    aExp = extractFloatx80Exp( a );
5468    aSign = extractFloatx80Sign( a );
5469    bSig = extractFloatx80Frac( b );
5470    bExp = extractFloatx80Exp( b );
5471    bSign = extractFloatx80Sign( b );
5472    zSign = aSign ^ bSign;
5473    if ( aExp == 0x7FFF ) {
5474        if ((uint64_t)(aSig << 1)) {
5475            return propagateFloatx80NaN(a, b, status);
5476        }
5477        if ( bExp == 0x7FFF ) {
5478            if ((uint64_t)(bSig << 1)) {
5479                return propagateFloatx80NaN(a, b, status);
5480            }
5481            goto invalid;
5482        }
5483        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5484    }
5485    if ( bExp == 0x7FFF ) {
5486        if ((uint64_t)(bSig << 1)) {
5487            return propagateFloatx80NaN(a, b, status);
5488        }
5489        return packFloatx80( zSign, 0, 0 );
5490    }
5491    if ( bExp == 0 ) {
5492        if ( bSig == 0 ) {
5493            if ( ( aExp | aSig ) == 0 ) {
5494 invalid:
5495                float_raise(float_flag_invalid, status);
5496                return floatx80_default_nan(status);
5497            }
5498            float_raise(float_flag_divbyzero, status);
5499            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5500        }
5501        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5502    }
5503    if ( aExp == 0 ) {
5504        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5505        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5506    }
5507    zExp = aExp - bExp + 0x3FFE;
5508    rem1 = 0;
5509    if ( bSig <= aSig ) {
5510        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5511        ++zExp;
5512    }
5513    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5514    mul64To128( bSig, zSig0, &term0, &term1 );
5515    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5516    while ( (int64_t) rem0 < 0 ) {
5517        --zSig0;
5518        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5519    }
5520    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5521    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5522        mul64To128( bSig, zSig1, &term1, &term2 );
5523        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5524        while ( (int64_t) rem1 < 0 ) {
5525            --zSig1;
5526            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5527        }
5528        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5529    }
5530    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5531                                zSign, zExp, zSig0, zSig1, status);
5532}
5533
5534/*----------------------------------------------------------------------------
5535| Returns the remainder of the extended double-precision floating-point value
5536| `a' with respect to the corresponding value `b'.  The operation is performed
5537| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538*----------------------------------------------------------------------------*/
5539
5540floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5541{
5542    flag aSign, zSign;
5543    int32_t aExp, bExp, expDiff;
5544    uint64_t aSig0, aSig1, bSig;
5545    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5546
5547    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5548        float_raise(float_flag_invalid, status);
5549        return floatx80_default_nan(status);
5550    }
5551    aSig0 = extractFloatx80Frac( a );
5552    aExp = extractFloatx80Exp( a );
5553    aSign = extractFloatx80Sign( a );
5554    bSig = extractFloatx80Frac( b );
5555    bExp = extractFloatx80Exp( b );
5556    if ( aExp == 0x7FFF ) {
5557        if (    (uint64_t) ( aSig0<<1 )
5558             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5559            return propagateFloatx80NaN(a, b, status);
5560        }
5561        goto invalid;
5562    }
5563    if ( bExp == 0x7FFF ) {
5564        if ((uint64_t)(bSig << 1)) {
5565            return propagateFloatx80NaN(a, b, status);
5566        }
5567        return a;
5568    }
5569    if ( bExp == 0 ) {
5570        if ( bSig == 0 ) {
5571 invalid:
5572            float_raise(float_flag_invalid, status);
5573            return floatx80_default_nan(status);
5574        }
5575        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5576    }
5577    if ( aExp == 0 ) {
5578        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5579        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5580    }
5581    bSig |= LIT64( 0x8000000000000000 );
5582    zSign = aSign;
5583    expDiff = aExp - bExp;
5584    aSig1 = 0;
5585    if ( expDiff < 0 ) {
5586        if ( expDiff < -1 ) return a;
5587        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5588        expDiff = 0;
5589    }
5590    q = ( bSig <= aSig0 );
5591    if ( q ) aSig0 -= bSig;
5592    expDiff -= 64;
5593    while ( 0 < expDiff ) {
5594        q = estimateDiv128To64( aSig0, aSig1, bSig );
5595        q = ( 2 < q ) ? q - 2 : 0;
5596        mul64To128( bSig, q, &term0, &term1 );
5597        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5598        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5599        expDiff -= 62;
5600    }
5601    expDiff += 64;
5602    if ( 0 < expDiff ) {
5603        q = estimateDiv128To64( aSig0, aSig1, bSig );
5604        q = ( 2 < q ) ? q - 2 : 0;
5605        q >>= 64 - expDiff;
5606        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5607        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5608        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5609        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5610            ++q;
5611            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5612        }
5613    }
5614    else {
5615        term1 = 0;
5616        term0 = bSig;
5617    }
5618    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5619    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5620         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5621              && ( q & 1 ) )
5622       ) {
5623        aSig0 = alternateASig0;
5624        aSig1 = alternateASig1;
5625        zSign = ! zSign;
5626    }
5627    return
5628        normalizeRoundAndPackFloatx80(
5629            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5630
5631}
5632
5633/*----------------------------------------------------------------------------
5634| Returns the square root of the extended double-precision floating-point
5635| value `a'.  The operation is performed according to the IEC/IEEE Standard
5636| for Binary Floating-Point Arithmetic.
5637*----------------------------------------------------------------------------*/
5638
5639floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5640{
5641    flag aSign;
5642    int32_t aExp, zExp;
5643    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5644    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5645
5646    if (floatx80_invalid_encoding(a)) {
5647        float_raise(float_flag_invalid, status);
5648        return floatx80_default_nan(status);
5649    }
5650    aSig0 = extractFloatx80Frac( a );
5651    aExp = extractFloatx80Exp( a );
5652    aSign = extractFloatx80Sign( a );
5653    if ( aExp == 0x7FFF ) {
5654        if ((uint64_t)(aSig0 << 1)) {
5655            return propagateFloatx80NaN(a, a, status);
5656        }
5657        if ( ! aSign ) return a;
5658        goto invalid;
5659    }
5660    if ( aSign ) {
5661        if ( ( aExp | aSig0 ) == 0 ) return a;
5662 invalid:
5663        float_raise(float_flag_invalid, status);
5664        return floatx80_default_nan(status);
5665    }
5666    if ( aExp == 0 ) {
5667        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5668        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5669    }
5670    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5671    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5672    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5673    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5674    doubleZSig0 = zSig0<<1;
5675    mul64To128( zSig0, zSig0, &term0, &term1 );
5676    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5677    while ( (int64_t) rem0 < 0 ) {
5678        --zSig0;
5679        doubleZSig0 -= 2;
5680        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5681    }
5682    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5683    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5684        if ( zSig1 == 0 ) zSig1 = 1;
5685        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5686        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5687        mul64To128( zSig1, zSig1, &term2, &term3 );
5688        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5689        while ( (int64_t) rem1 < 0 ) {
5690            --zSig1;
5691            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5692            term3 |= 1;
5693            term2 |= doubleZSig0;
5694            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5695        }
5696        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5697    }
5698    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5699    zSig0 |= doubleZSig0;
5700    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5701                                0, zExp, zSig0, zSig1, status);
5702}
5703
5704/*----------------------------------------------------------------------------
5705| Returns 1 if the extended double-precision floating-point value `a' is equal
5706| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5707| raised if either operand is a NaN.  Otherwise, the comparison is performed
5708| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709*----------------------------------------------------------------------------*/
5710
5711int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5712{
5713
5714    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5715        || (extractFloatx80Exp(a) == 0x7FFF
5716            && (uint64_t) (extractFloatx80Frac(a) << 1))
5717        || (extractFloatx80Exp(b) == 0x7FFF
5718            && (uint64_t) (extractFloatx80Frac(b) << 1))
5719       ) {
5720        float_raise(float_flag_invalid, status);
5721        return 0;
5722    }
5723    return
5724           ( a.low == b.low )
5725        && (    ( a.high == b.high )
5726             || (    ( a.low == 0 )
5727                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5728           );
5729
5730}
5731
5732/*----------------------------------------------------------------------------
5733| Returns 1 if the extended double-precision floating-point value `a' is
5734| less than or equal to the corresponding value `b', and 0 otherwise.  The
5735| invalid exception is raised if either operand is a NaN.  The comparison is
5736| performed according to the IEC/IEEE Standard for Binary Floating-Point
5737| Arithmetic.
5738*----------------------------------------------------------------------------*/
5739
5740int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5741{
5742    flag aSign, bSign;
5743
5744    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5745        || (extractFloatx80Exp(a) == 0x7FFF
5746            && (uint64_t) (extractFloatx80Frac(a) << 1))
5747        || (extractFloatx80Exp(b) == 0x7FFF
5748            && (uint64_t) (extractFloatx80Frac(b) << 1))
5749       ) {
5750        float_raise(float_flag_invalid, status);
5751        return 0;
5752    }
5753    aSign = extractFloatx80Sign( a );
5754    bSign = extractFloatx80Sign( b );
5755    if ( aSign != bSign ) {
5756        return
5757               aSign
5758            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5759                 == 0 );
5760    }
5761    return
5762          aSign ? le128( b.high, b.low, a.high, a.low )
5763        : le128( a.high, a.low, b.high, b.low );
5764
5765}
5766
5767/*----------------------------------------------------------------------------
5768| Returns 1 if the extended double-precision floating-point value `a' is
5769| less than the corresponding value `b', and 0 otherwise.  The invalid
5770| exception is raised if either operand is a NaN.  The comparison is performed
5771| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5772*----------------------------------------------------------------------------*/
5773
5774int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5775{
5776    flag aSign, bSign;
5777
5778    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5779        || (extractFloatx80Exp(a) == 0x7FFF
5780            && (uint64_t) (extractFloatx80Frac(a) << 1))
5781        || (extractFloatx80Exp(b) == 0x7FFF
5782            && (uint64_t) (extractFloatx80Frac(b) << 1))
5783       ) {
5784        float_raise(float_flag_invalid, status);
5785        return 0;
5786    }
5787    aSign = extractFloatx80Sign( a );
5788    bSign = extractFloatx80Sign( b );
5789    if ( aSign != bSign ) {
5790        return
5791               aSign
5792            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5793                 != 0 );
5794    }
5795    return
5796          aSign ? lt128( b.high, b.low, a.high, a.low )
5797        : lt128( a.high, a.low, b.high, b.low );
5798
5799}
5800
5801/*----------------------------------------------------------------------------
5802| Returns 1 if the extended double-precision floating-point values `a' and `b'
5803| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5804| either operand is a NaN.   The comparison is performed according to the
5805| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5806*----------------------------------------------------------------------------*/
5807int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5808{
5809    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5810        || (extractFloatx80Exp(a) == 0x7FFF
5811            && (uint64_t) (extractFloatx80Frac(a) << 1))
5812        || (extractFloatx80Exp(b) == 0x7FFF
5813            && (uint64_t) (extractFloatx80Frac(b) << 1))
5814       ) {
5815        float_raise(float_flag_invalid, status);
5816        return 1;
5817    }
5818    return 0;
5819}
5820
5821/*----------------------------------------------------------------------------
5822| Returns 1 if the extended double-precision floating-point value `a' is
5823| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5824| cause an exception.  The comparison is performed according to the IEC/IEEE
5825| Standard for Binary Floating-Point Arithmetic.
5826*----------------------------------------------------------------------------*/
5827
5828int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5829{
5830
5831    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5832        float_raise(float_flag_invalid, status);
5833        return 0;
5834    }
5835    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5836              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5837         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5838              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5839       ) {
5840        if (floatx80_is_signaling_nan(a, status)
5841         || floatx80_is_signaling_nan(b, status)) {
5842            float_raise(float_flag_invalid, status);
5843        }
5844        return 0;
5845    }
5846    return
5847           ( a.low == b.low )
5848        && (    ( a.high == b.high )
5849             || (    ( a.low == 0 )
5850                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5851           );
5852
5853}
5854
5855/*----------------------------------------------------------------------------
5856| Returns 1 if the extended double-precision floating-point value `a' is less
5857| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5858| do not cause an exception.  Otherwise, the comparison is performed according
5859| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5860*----------------------------------------------------------------------------*/
5861
5862int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5863{
5864    flag aSign, bSign;
5865
5866    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5867        float_raise(float_flag_invalid, status);
5868        return 0;
5869    }
5870    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5871              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5872         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5873              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5874       ) {
5875        if (floatx80_is_signaling_nan(a, status)
5876         || floatx80_is_signaling_nan(b, status)) {
5877            float_raise(float_flag_invalid, status);
5878        }
5879        return 0;
5880    }
5881    aSign = extractFloatx80Sign( a );
5882    bSign = extractFloatx80Sign( b );
5883    if ( aSign != bSign ) {
5884        return
5885               aSign
5886            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5887                 == 0 );
5888    }
5889    return
5890          aSign ? le128( b.high, b.low, a.high, a.low )
5891        : le128( a.high, a.low, b.high, b.low );
5892
5893}
5894
5895/*----------------------------------------------------------------------------
5896| Returns 1 if the extended double-precision floating-point value `a' is less
5897| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5898| an exception.  Otherwise, the comparison is performed according to the
5899| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5900*----------------------------------------------------------------------------*/
5901
5902int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5903{
5904    flag aSign, bSign;
5905
5906    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5907        float_raise(float_flag_invalid, status);
5908        return 0;
5909    }
5910    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5911              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5912         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5913              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5914       ) {
5915        if (floatx80_is_signaling_nan(a, status)
5916         || floatx80_is_signaling_nan(b, status)) {
5917            float_raise(float_flag_invalid, status);
5918        }
5919        return 0;
5920    }
5921    aSign = extractFloatx80Sign( a );
5922    bSign = extractFloatx80Sign( b );
5923    if ( aSign != bSign ) {
5924        return
5925               aSign
5926            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5927                 != 0 );
5928    }
5929    return
5930          aSign ? lt128( b.high, b.low, a.high, a.low )
5931        : lt128( a.high, a.low, b.high, b.low );
5932
5933}
5934
5935/*----------------------------------------------------------------------------
5936| Returns 1 if the extended double-precision floating-point values `a' and `b'
5937| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5938| The comparison is performed according to the IEC/IEEE Standard for Binary
5939| Floating-Point Arithmetic.
5940*----------------------------------------------------------------------------*/
5941int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5942{
5943    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5944        float_raise(float_flag_invalid, status);
5945        return 1;
5946    }
5947    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5948              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5949         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5950              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5951       ) {
5952        if (floatx80_is_signaling_nan(a, status)
5953         || floatx80_is_signaling_nan(b, status)) {
5954            float_raise(float_flag_invalid, status);
5955        }
5956        return 1;
5957    }
5958    return 0;
5959}
5960
5961/*----------------------------------------------------------------------------
5962| Returns the result of converting the quadruple-precision floating-point
5963| value `a' to the 32-bit two's complement integer format.  The conversion
5964| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5965| Arithmetic---which means in particular that the conversion is rounded
5966| according to the current rounding mode.  If `a' is a NaN, the largest
5967| positive integer is returned.  Otherwise, if the conversion overflows, the
5968| largest integer with the same sign as `a' is returned.
5969*----------------------------------------------------------------------------*/
5970
5971int32_t float128_to_int32(float128 a, float_status *status)
5972{
5973    flag aSign;
5974    int32_t aExp, shiftCount;
5975    uint64_t aSig0, aSig1;
5976
5977    aSig1 = extractFloat128Frac1( a );
5978    aSig0 = extractFloat128Frac0( a );
5979    aExp = extractFloat128Exp( a );
5980    aSign = extractFloat128Sign( a );
5981    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5982    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5983    aSig0 |= ( aSig1 != 0 );
5984    shiftCount = 0x4028 - aExp;
5985    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5986    return roundAndPackInt32(aSign, aSig0, status);
5987
5988}
5989
5990/*----------------------------------------------------------------------------
5991| Returns the result of converting the quadruple-precision floating-point
5992| value `a' to the 32-bit two's complement integer format.  The conversion
5993| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5994| Arithmetic, except that the conversion is always rounded toward zero.  If
5995| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5996| conversion overflows, the largest integer with the same sign as `a' is
5997| returned.
5998*----------------------------------------------------------------------------*/
5999
6000int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6001{
6002    flag aSign;
6003    int32_t aExp, shiftCount;
6004    uint64_t aSig0, aSig1, savedASig;
6005    int32_t z;
6006
6007    aSig1 = extractFloat128Frac1( a );
6008    aSig0 = extractFloat128Frac0( a );
6009    aExp = extractFloat128Exp( a );
6010    aSign = extractFloat128Sign( a );
6011    aSig0 |= ( aSig1 != 0 );
6012    if ( 0x401E < aExp ) {
6013        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6014        goto invalid;
6015    }
6016    else if ( aExp < 0x3FFF ) {
6017        if (aExp || aSig0) {
6018            status->float_exception_flags |= float_flag_inexact;
6019        }
6020        return 0;
6021    }
6022    aSig0 |= LIT64( 0x0001000000000000 );
6023    shiftCount = 0x402F - aExp;
6024    savedASig = aSig0;
6025    aSig0 >>= shiftCount;
6026    z = aSig0;
6027    if ( aSign ) z = - z;
6028    if ( ( z < 0 ) ^ aSign ) {
6029 invalid:
6030        float_raise(float_flag_invalid, status);
6031        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6032    }
6033    if ( ( aSig0<<shiftCount ) != savedASig ) {
6034        status->float_exception_flags |= float_flag_inexact;
6035    }
6036    return z;
6037
6038}
6039
6040/*----------------------------------------------------------------------------
6041| Returns the result of converting the quadruple-precision floating-point
6042| value `a' to the 64-bit two's complement integer format.  The conversion
6043| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6044| Arithmetic---which means in particular that the conversion is rounded
6045| according to the current rounding mode.  If `a' is a NaN, the largest
6046| positive integer is returned.  Otherwise, if the conversion overflows, the
6047| largest integer with the same sign as `a' is returned.
6048*----------------------------------------------------------------------------*/
6049
6050int64_t float128_to_int64(float128 a, float_status *status)
6051{
6052    flag aSign;
6053    int32_t aExp, shiftCount;
6054    uint64_t aSig0, aSig1;
6055
6056    aSig1 = extractFloat128Frac1( a );
6057    aSig0 = extractFloat128Frac0( a );
6058    aExp = extractFloat128Exp( a );
6059    aSign = extractFloat128Sign( a );
6060    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6061    shiftCount = 0x402F - aExp;
6062    if ( shiftCount <= 0 ) {
6063        if ( 0x403E < aExp ) {
6064            float_raise(float_flag_invalid, status);
6065            if (    ! aSign
6066                 || (    ( aExp == 0x7FFF )
6067                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6068                    )
6069               ) {
6070                return LIT64( 0x7FFFFFFFFFFFFFFF );
6071            }
6072            return (int64_t) LIT64( 0x8000000000000000 );
6073        }
6074        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6075    }
6076    else {
6077        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6078    }
6079    return roundAndPackInt64(aSign, aSig0, aSig1, status);
6080
6081}
6082
6083/*----------------------------------------------------------------------------
6084| Returns the result of converting the quadruple-precision floating-point
6085| value `a' to the 64-bit two's complement integer format.  The conversion
6086| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6087| Arithmetic, except that the conversion is always rounded toward zero.
6088| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6089| the conversion overflows, the largest integer with the same sign as `a' is
6090| returned.
6091*----------------------------------------------------------------------------*/
6092
6093int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6094{
6095    flag aSign;
6096    int32_t aExp, shiftCount;
6097    uint64_t aSig0, aSig1;
6098    int64_t z;
6099
6100    aSig1 = extractFloat128Frac1( a );
6101    aSig0 = extractFloat128Frac0( a );
6102    aExp = extractFloat128Exp( a );
6103    aSign = extractFloat128Sign( a );
6104    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6105    shiftCount = aExp - 0x402F;
6106    if ( 0 < shiftCount ) {
6107        if ( 0x403E <= aExp ) {
6108            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6109            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6110                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6111                if (aSig1) {
6112                    status->float_exception_flags |= float_flag_inexact;
6113                }
6114            }
6115            else {
6116                float_raise(float_flag_invalid, status);
6117                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6118                    return LIT64( 0x7FFFFFFFFFFFFFFF );
6119                }
6120            }
6121            return (int64_t) LIT64( 0x8000000000000000 );
6122        }
6123        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6124        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6125            status->float_exception_flags |= float_flag_inexact;
6126        }
6127    }
6128    else {
6129        if ( aExp < 0x3FFF ) {
6130            if ( aExp | aSig0 | aSig1 ) {
6131                status->float_exception_flags |= float_flag_inexact;
6132            }
6133            return 0;
6134        }
6135        z = aSig0>>( - shiftCount );
6136        if (    aSig1
6137             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6138            status->float_exception_flags |= float_flag_inexact;
6139        }
6140    }
6141    if ( aSign ) z = - z;
6142    return z;
6143
6144}
6145
6146/*----------------------------------------------------------------------------
6147| Returns the result of converting the quadruple-precision floating-point value
6148| `a' to the 64-bit unsigned integer format.  The conversion is
6149| performed according to the IEC/IEEE Standard for Binary Floating-Point
6150| Arithmetic---which means in particular that the conversion is rounded
6151| according to the current rounding mode.  If `a' is a NaN, the largest
6152| positive integer is returned.  If the conversion overflows, the
6153| largest unsigned integer is returned.  If 'a' is negative, the value is
6154| rounded and zero is returned; negative values that do not round to zero
6155| will raise the inexact exception.
6156*----------------------------------------------------------------------------*/
6157
6158uint64_t float128_to_uint64(float128 a, float_status *status)
6159{
6160    flag aSign;
6161    int aExp;
6162    int shiftCount;
6163    uint64_t aSig0, aSig1;
6164
6165    aSig0 = extractFloat128Frac0(a);
6166    aSig1 = extractFloat128Frac1(a);
6167    aExp = extractFloat128Exp(a);
6168    aSign = extractFloat128Sign(a);
6169    if (aSign && (aExp > 0x3FFE)) {
6170        float_raise(float_flag_invalid, status);
6171        if (float128_is_any_nan(a)) {
6172            return LIT64(0xFFFFFFFFFFFFFFFF);
6173        } else {
6174            return 0;
6175        }
6176    }
6177    if (aExp) {
6178        aSig0 |= LIT64(0x0001000000000000);
6179    }
6180    shiftCount = 0x402F - aExp;
6181    if (shiftCount <= 0) {
6182        if (0x403E < aExp) {
6183            float_raise(float_flag_invalid, status);
6184            return LIT64(0xFFFFFFFFFFFFFFFF);
6185        }
6186        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6187    } else {
6188        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6189    }
6190    return roundAndPackUint64(aSign, aSig0, aSig1, status);
6191}
6192
6193uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6194{
6195    uint64_t v;
6196    signed char current_rounding_mode = status->float_rounding_mode;
6197
6198    set_float_rounding_mode(float_round_to_zero, status);
6199    v = float128_to_uint64(a, status);
6200    set_float_rounding_mode(current_rounding_mode, status);
6201
6202    return v;
6203}
6204
6205/*----------------------------------------------------------------------------
6206| Returns the result of converting the quadruple-precision floating-point
6207| value `a' to the 32-bit unsigned integer format.  The conversion
6208| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6209| Arithmetic except that the conversion is always rounded toward zero.
6210| If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6211| if the conversion overflows, the largest unsigned integer is returned.
6212| If 'a' is negative, the value is rounded and zero is returned; negative
6213| values that do not round to zero will raise the inexact exception.
6214*----------------------------------------------------------------------------*/
6215
6216uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6217{
6218    uint64_t v;
6219    uint32_t res;
6220    int old_exc_flags = get_float_exception_flags(status);
6221
6222    v = float128_to_uint64_round_to_zero(a, status);
6223    if (v > 0xffffffff) {
6224        res = 0xffffffff;
6225    } else {
6226        return v;
6227    }
6228    set_float_exception_flags(old_exc_flags, status);
6229    float_raise(float_flag_invalid, status);
6230    return res;
6231}
6232
6233/*----------------------------------------------------------------------------
6234| Returns the result of converting the quadruple-precision floating-point
6235| value `a' to the single-precision floating-point format.  The conversion
6236| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6237| Arithmetic.
6238*----------------------------------------------------------------------------*/
6239
6240float32 float128_to_float32(float128 a, float_status *status)
6241{
6242    flag aSign;
6243    int32_t aExp;
6244    uint64_t aSig0, aSig1;
6245    uint32_t zSig;
6246
6247    aSig1 = extractFloat128Frac1( a );
6248    aSig0 = extractFloat128Frac0( a );
6249    aExp = extractFloat128Exp( a );
6250    aSign = extractFloat128Sign( a );
6251    if ( aExp == 0x7FFF ) {
6252        if ( aSig0 | aSig1 ) {
6253            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6254        }
6255        return packFloat32( aSign, 0xFF, 0 );
6256    }
6257    aSig0 |= ( aSig1 != 0 );
6258    shift64RightJamming( aSig0, 18, &aSig0 );
6259    zSig = aSig0;
6260    if ( aExp || zSig ) {
6261        zSig |= 0x40000000;
6262        aExp -= 0x3F81;
6263    }
6264    return roundAndPackFloat32(aSign, aExp, zSig, status);
6265
6266}
6267
6268/*----------------------------------------------------------------------------
6269| Returns the result of converting the quadruple-precision floating-point
6270| value `a' to the double-precision floating-point format.  The conversion
6271| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6272| Arithmetic.
6273*----------------------------------------------------------------------------*/
6274
6275float64 float128_to_float64(float128 a, float_status *status)
6276{
6277    flag aSign;
6278    int32_t aExp;
6279    uint64_t aSig0, aSig1;
6280
6281    aSig1 = extractFloat128Frac1( a );
6282    aSig0 = extractFloat128Frac0( a );
6283    aExp = extractFloat128Exp( a );
6284    aSign = extractFloat128Sign( a );
6285    if ( aExp == 0x7FFF ) {
6286        if ( aSig0 | aSig1 ) {
6287            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6288        }
6289        return packFloat64( aSign, 0x7FF, 0 );
6290    }
6291    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6292    aSig0 |= ( aSig1 != 0 );
6293    if ( aExp || aSig0 ) {
6294        aSig0 |= LIT64( 0x4000000000000000 );
6295        aExp -= 0x3C01;
6296    }
6297    return roundAndPackFloat64(aSign, aExp, aSig0, status);
6298
6299}
6300
6301/*----------------------------------------------------------------------------
6302| Returns the result of converting the quadruple-precision floating-point
6303| value `a' to the extended double-precision floating-point format.  The
6304| conversion is performed according to the IEC/IEEE Standard for Binary
6305| Floating-Point Arithmetic.
6306*----------------------------------------------------------------------------*/
6307
6308floatx80 float128_to_floatx80(float128 a, float_status *status)
6309{
6310    flag aSign;
6311    int32_t aExp;
6312    uint64_t aSig0, aSig1;
6313
6314    aSig1 = extractFloat128Frac1( a );
6315    aSig0 = extractFloat128Frac0( a );
6316    aExp = extractFloat128Exp( a );
6317    aSign = extractFloat128Sign( a );
6318    if ( aExp == 0x7FFF ) {
6319        if ( aSig0 | aSig1 ) {
6320            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6321        }
6322        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6323    }
6324    if ( aExp == 0 ) {
6325        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6326        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6327    }
6328    else {
6329        aSig0 |= LIT64( 0x0001000000000000 );
6330    }
6331    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6332    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6333
6334}
6335
6336/*----------------------------------------------------------------------------
6337| Rounds the quadruple-precision floating-point value `a' to an integer, and
6338| returns the result as a quadruple-precision floating-point value.  The
6339| operation is performed according to the IEC/IEEE Standard for Binary
6340| Floating-Point Arithmetic.
6341*----------------------------------------------------------------------------*/
6342
6343float128 float128_round_to_int(float128 a, float_status *status)
6344{
6345    flag aSign;
6346    int32_t aExp;
6347    uint64_t lastBitMask, roundBitsMask;
6348    float128 z;
6349
6350    aExp = extractFloat128Exp( a );
6351    if ( 0x402F <= aExp ) {
6352        if ( 0x406F <= aExp ) {
6353            if (    ( aExp == 0x7FFF )
6354                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6355               ) {
6356                return propagateFloat128NaN(a, a, status);
6357            }
6358            return a;
6359        }
6360        lastBitMask = 1;
6361        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6362        roundBitsMask = lastBitMask - 1;
6363        z = a;
6364        switch (status->float_rounding_mode) {
6365        case float_round_nearest_even:
6366            if ( lastBitMask ) {
6367                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6368                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6369            }
6370            else {
6371                if ( (int64_t) z.low < 0 ) {
6372                    ++z.high;
6373                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6374                }
6375            }
6376            break;
6377        case float_round_ties_away:
6378            if (lastBitMask) {
6379                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6380            } else {
6381                if ((int64_t) z.low < 0) {
6382                    ++z.high;
6383                }
6384            }
6385            break;
6386        case float_round_to_zero:
6387            break;
6388        case float_round_up:
6389            if (!extractFloat128Sign(z)) {
6390                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6391            }
6392            break;
6393        case float_round_down:
6394            if (extractFloat128Sign(z)) {
6395                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6396            }
6397            break;
6398        default:
6399            abort();
6400        }
6401        z.low &= ~ roundBitsMask;
6402    }
6403    else {
6404        if ( aExp < 0x3FFF ) {
6405            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6406            status->float_exception_flags |= float_flag_inexact;
6407            aSign = extractFloat128Sign( a );
6408            switch (status->float_rounding_mode) {
6409             case float_round_nearest_even:
6410                if (    ( aExp == 0x3FFE )
6411                     && (   extractFloat128Frac0( a )
6412                          | extractFloat128Frac1( a ) )
6413                   ) {
6414                    return packFloat128( aSign, 0x3FFF, 0, 0 );
6415                }
6416                break;
6417            case float_round_ties_away:
6418                if (aExp == 0x3FFE) {
6419                    return packFloat128(aSign, 0x3FFF, 0, 0);
6420                }
6421                break;
6422             case float_round_down:
6423                return
6424                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6425                    : packFloat128( 0, 0, 0, 0 );
6426             case float_round_up:
6427                return
6428                      aSign ? packFloat128( 1, 0, 0, 0 )
6429                    : packFloat128( 0, 0x3FFF, 0, 0 );
6430            }
6431            return packFloat128( aSign, 0, 0, 0 );
6432        }
6433        lastBitMask = 1;
6434        lastBitMask <<= 0x402F - aExp;
6435        roundBitsMask = lastBitMask - 1;
6436        z.low = 0;
6437        z.high = a.high;
6438        switch (status->float_rounding_mode) {
6439        case float_round_nearest_even:
6440            z.high += lastBitMask>>1;
6441            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6442                z.high &= ~ lastBitMask;
6443            }
6444            break;
6445        case float_round_ties_away:
6446            z.high += lastBitMask>>1;
6447            break;
6448        case float_round_to_zero:
6449            break;
6450        case float_round_up:
6451            if (!extractFloat128Sign(z)) {
6452                z.high |= ( a.low != 0 );
6453                z.high += roundBitsMask;
6454            }
6455            break;
6456        case float_round_down:
6457            if (extractFloat128Sign(z)) {
6458                z.high |= (a.low != 0);
6459                z.high += roundBitsMask;
6460            }
6461            break;
6462        default:
6463            abort();
6464        }
6465        z.high &= ~ roundBitsMask;
6466    }
6467    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6468        status->float_exception_flags |= float_flag_inexact;
6469    }
6470    return z;
6471
6472}
6473
6474/*----------------------------------------------------------------------------
6475| Returns the result of adding the absolute values of the quadruple-precision
6476| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6477| before being returned.  `zSign' is ignored if the result is a NaN.
6478| The addition is performed according to the IEC/IEEE Standard for Binary
6479| Floating-Point Arithmetic.
6480*----------------------------------------------------------------------------*/
6481
6482static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6483                                float_status *status)
6484{
6485    int32_t aExp, bExp, zExp;
6486    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6487    int32_t expDiff;
6488
6489    aSig1 = extractFloat128Frac1( a );
6490    aSig0 = extractFloat128Frac0( a );
6491    aExp = extractFloat128Exp( a );
6492    bSig1 = extractFloat128Frac1( b );
6493    bSig0 = extractFloat128Frac0( b );
6494    bExp = extractFloat128Exp( b );
6495    expDiff = aExp - bExp;
6496    if ( 0 < expDiff ) {
6497        if ( aExp == 0x7FFF ) {
6498            if (aSig0 | aSig1) {
6499                return propagateFloat128NaN(a, b, status);
6500            }
6501            return a;
6502        }
6503        if ( bExp == 0 ) {
6504            --expDiff;
6505        }
6506        else {
6507            bSig0 |= LIT64( 0x0001000000000000 );
6508        }
6509        shift128ExtraRightJamming(
6510            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6511        zExp = aExp;
6512    }
6513    else if ( expDiff < 0 ) {
6514        if ( bExp == 0x7FFF ) {
6515            if (bSig0 | bSig1) {
6516                return propagateFloat128NaN(a, b, status);
6517            }
6518            return packFloat128( zSign, 0x7FFF, 0, 0 );
6519        }
6520        if ( aExp == 0 ) {
6521            ++expDiff;
6522        }
6523        else {
6524            aSig0 |= LIT64( 0x0001000000000000 );
6525        }
6526        shift128ExtraRightJamming(
6527            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6528        zExp = bExp;
6529    }
6530    else {
6531        if ( aExp == 0x7FFF ) {
6532            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6533                return propagateFloat128NaN(a, b, status);
6534            }
6535            return a;
6536        }
6537        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6538        if ( aExp == 0 ) {
6539            if (status->flush_to_zero) {
6540                if (zSig0 | zSig1) {
6541                    float_raise(float_flag_output_denormal, status);
6542                }
6543                return packFloat128(zSign, 0, 0, 0);
6544            }
6545            return packFloat128( zSign, 0, zSig0, zSig1 );
6546        }
6547        zSig2 = 0;
6548        zSig0 |= LIT64( 0x0002000000000000 );
6549        zExp = aExp;
6550        goto shiftRight1;
6551    }
6552    aSig0 |= LIT64( 0x0001000000000000 );
6553    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6554    --zExp;
6555    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6556    ++zExp;
6557 shiftRight1:
6558    shift128ExtraRightJamming(
6559        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6560 roundAndPack:
6561    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6562
6563}
6564
6565/*----------------------------------------------------------------------------
6566| Returns the result of subtracting the absolute values of the quadruple-
6567| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6568| difference is negated before being returned.  `zSign' is ignored if the
6569| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6570| Standard for Binary Floating-Point Arithmetic.
6571*----------------------------------------------------------------------------*/
6572
6573static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6574                                float_status *status)
6575{
6576    int32_t aExp, bExp, zExp;
6577    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6578    int32_t expDiff;
6579
6580    aSig1 = extractFloat128Frac1( a );
6581    aSig0 = extractFloat128Frac0( a );
6582    aExp = extractFloat128Exp( a );
6583    bSig1 = extractFloat128Frac1( b );
6584    bSig0 = extractFloat128Frac0( b );
6585    bExp = extractFloat128Exp( b );
6586    expDiff = aExp - bExp;
6587    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6588    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6589    if ( 0 < expDiff ) goto aExpBigger;
6590    if ( expDiff < 0 ) goto bExpBigger;
6591    if ( aExp == 0x7FFF ) {
6592        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6593            return propagateFloat128NaN(a, b, status);
6594        }
6595        float_raise(float_flag_invalid, status);
6596        return float128_default_nan(status);
6597    }
6598    if ( aExp == 0 ) {
6599        aExp = 1;
6600        bExp = 1;
6601    }
6602    if ( bSig0 < aSig0 ) goto aBigger;
6603    if ( aSig0 < bSig0 ) goto bBigger;
6604    if ( bSig1 < aSig1 ) goto aBigger;
6605    if ( aSig1 < bSig1 ) goto bBigger;
6606    return packFloat128(status->float_rounding_mode == float_round_down,
6607                        0, 0, 0);
6608 bExpBigger:
6609    if ( bExp == 0x7FFF ) {
6610        if (bSig0 | bSig1) {
6611            return propagateFloat128NaN(a, b, status);
6612        }
6613        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6614    }
6615    if ( aExp == 0 ) {
6616        ++expDiff;
6617    }
6618    else {
6619        aSig0 |= LIT64( 0x4000000000000000 );
6620    }
6621    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6622    bSig0 |= LIT64( 0x4000000000000000 );
6623 bBigger:
6624    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6625    zExp = bExp;
6626    zSign ^= 1;
6627    goto normalizeRoundAndPack;
6628 aExpBigger:
6629    if ( aExp == 0x7FFF ) {
6630        if (aSig0 | aSig1) {
6631            return propagateFloat128NaN(a, b, status);
6632        }
6633        return a;
6634    }
6635    if ( bExp == 0 ) {
6636        --expDiff;
6637    }
6638    else {
6639        bSig0 |= LIT64( 0x4000000000000000 );
6640    }
6641    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6642    aSig0 |= LIT64( 0x4000000000000000 );
6643 aBigger:
6644    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6645    zExp = aExp;
6646 normalizeRoundAndPack:
6647    --zExp;
6648    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6649                                         status);
6650
6651}
6652
6653/*----------------------------------------------------------------------------
6654| Returns the result of adding the quadruple-precision floating-point values
6655| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6656| for Binary Floating-Point Arithmetic.
6657*----------------------------------------------------------------------------*/
6658
6659float128 float128_add(float128 a, float128 b, float_status *status)
6660{
6661    flag aSign, bSign;
6662
6663    aSign = extractFloat128Sign( a );
6664    bSign = extractFloat128Sign( b );
6665    if ( aSign == bSign ) {
6666        return addFloat128Sigs(a, b, aSign, status);
6667    }
6668    else {
6669        return subFloat128Sigs(a, b, aSign, status);
6670    }
6671
6672}
6673
6674/*----------------------------------------------------------------------------
6675| Returns the result of subtracting the quadruple-precision floating-point
6676| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6677| Standard for Binary Floating-Point Arithmetic.
6678*----------------------------------------------------------------------------*/
6679
6680float128 float128_sub(float128 a, float128 b, float_status *status)
6681{
6682    flag aSign, bSign;
6683
6684    aSign = extractFloat128Sign( a );
6685    bSign = extractFloat128Sign( b );
6686    if ( aSign == bSign ) {
6687        return subFloat128Sigs(a, b, aSign, status);
6688    }
6689    else {
6690        return addFloat128Sigs(a, b, aSign, status);
6691    }
6692
6693}
6694
6695/*----------------------------------------------------------------------------
6696| Returns the result of multiplying the quadruple-precision floating-point
6697| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6698| Standard for Binary Floating-Point Arithmetic.
6699*----------------------------------------------------------------------------*/
6700
6701float128 float128_mul(float128 a, float128 b, float_status *status)
6702{
6703    flag aSign, bSign, zSign;
6704    int32_t aExp, bExp, zExp;
6705    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6706
6707    aSig1 = extractFloat128Frac1( a );
6708    aSig0 = extractFloat128Frac0( a );
6709    aExp = extractFloat128Exp( a );
6710    aSign = extractFloat128Sign( a );
6711    bSig1 = extractFloat128Frac1( b );
6712    bSig0 = extractFloat128Frac0( b );
6713    bExp = extractFloat128Exp( b );
6714    bSign = extractFloat128Sign( b );
6715    zSign = aSign ^ bSign;
6716    if ( aExp == 0x7FFF ) {
6717        if (    ( aSig0 | aSig1 )
6718             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6719            return propagateFloat128NaN(a, b, status);
6720        }
6721        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6722        return packFloat128( zSign, 0x7FFF, 0, 0 );
6723    }
6724    if ( bExp == 0x7FFF ) {
6725        if (bSig0 | bSig1) {
6726            return propagateFloat128NaN(a, b, status);
6727        }
6728        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6729 invalid:
6730            float_raise(float_flag_invalid, status);
6731            return float128_default_nan(status);
6732        }
6733        return packFloat128( zSign, 0x7FFF, 0, 0 );
6734    }
6735    if ( aExp == 0 ) {
6736        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6737        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6738    }
6739    if ( bExp == 0 ) {
6740        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6741        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6742    }
6743    zExp = aExp + bExp - 0x4000;
6744    aSig0 |= LIT64( 0x0001000000000000 );
6745    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6746    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6747    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6748    zSig2 |= ( zSig3 != 0 );
6749    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6750        shift128ExtraRightJamming(
6751            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6752        ++zExp;
6753    }
6754    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6755
6756}
6757
6758/*----------------------------------------------------------------------------
6759| Returns the result of dividing the quadruple-precision floating-point value
6760| `a' by the corresponding value `b'.  The operation is performed according to
6761| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6762*----------------------------------------------------------------------------*/
6763
6764float128 float128_div(float128 a, float128 b, float_status *status)
6765{
6766    flag aSign, bSign, zSign;
6767    int32_t aExp, bExp, zExp;
6768    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6769    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6770
6771    aSig1 = extractFloat128Frac1( a );
6772    aSig0 = extractFloat128Frac0( a );
6773    aExp = extractFloat128Exp( a );
6774    aSign = extractFloat128Sign( a );
6775    bSig1 = extractFloat128Frac1( b );
6776    bSig0 = extractFloat128Frac0( b );
6777    bExp = extractFloat128Exp( b );
6778    bSign = extractFloat128Sign( b );
6779    zSign = aSign ^ bSign;
6780    if ( aExp == 0x7FFF ) {
6781        if (aSig0 | aSig1) {
6782            return propagateFloat128NaN(a, b, status);
6783        }
6784        if ( bExp == 0x7FFF ) {
6785            if (bSig0 | bSig1) {
6786                return propagateFloat128NaN(a, b, status);
6787            }
6788            goto invalid;
6789        }
6790        return packFloat128( zSign, 0x7FFF, 0, 0 );
6791    }
6792    if ( bExp == 0x7FFF ) {
6793        if (bSig0 | bSig1) {
6794            return propagateFloat128NaN(a, b, status);
6795        }
6796        return packFloat128( zSign, 0, 0, 0 );
6797    }
6798    if ( bExp == 0 ) {
6799        if ( ( bSig0 | bSig1 ) == 0 ) {
6800            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6801 invalid:
6802                float_raise(float_flag_invalid, status);
6803                return float128_default_nan(status);
6804            }
6805            float_raise(float_flag_divbyzero, status);
6806            return packFloat128( zSign, 0x7FFF, 0, 0 );
6807        }
6808        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6809    }
6810    if ( aExp == 0 ) {
6811        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6812        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6813    }
6814    zExp = aExp - bExp + 0x3FFD;
6815    shortShift128Left(
6816        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6817    shortShift128Left(
6818        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6819    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6820        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6821        ++zExp;
6822    }
6823    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6824    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6825    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6826    while ( (int64_t) rem0 < 0 ) {
6827        --zSig0;
6828        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6829    }
6830    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6831    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6832        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6833        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6834        while ( (int64_t) rem1 < 0 ) {
6835            --zSig1;
6836            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6837        }
6838        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6839    }
6840    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6841    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6842
6843}
6844
6845/*----------------------------------------------------------------------------
6846| Returns the remainder of the quadruple-precision floating-point value `a'
6847| with respect to the corresponding value `b'.  The operation is performed
6848| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849*----------------------------------------------------------------------------*/
6850
6851float128 float128_rem(float128 a, float128 b, float_status *status)
6852{
6853    flag aSign, zSign;
6854    int32_t aExp, bExp, expDiff;
6855    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6856    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6857    int64_t sigMean0;
6858
6859    aSig1 = extractFloat128Frac1( a );
6860    aSig0 = extractFloat128Frac0( a );
6861    aExp = extractFloat128Exp( a );
6862    aSign = extractFloat128Sign( a );
6863    bSig1 = extractFloat128Frac1( b );
6864    bSig0 = extractFloat128Frac0( b );
6865    bExp = extractFloat128Exp( b );
6866    if ( aExp == 0x7FFF ) {
6867        if (    ( aSig0 | aSig1 )
6868             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6869            return propagateFloat128NaN(a, b, status);
6870        }
6871        goto invalid;
6872    }
6873    if ( bExp == 0x7FFF ) {
6874        if (bSig0 | bSig1) {
6875            return propagateFloat128NaN(a, b, status);
6876        }
6877        return a;
6878    }
6879    if ( bExp == 0 ) {
6880        if ( ( bSig0 | bSig1 ) == 0 ) {
6881 invalid:
6882            float_raise(float_flag_invalid, status);
6883            return float128_default_nan(status);
6884        }
6885        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6886    }
6887    if ( aExp == 0 ) {
6888        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6889        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6890    }
6891    expDiff = aExp - bExp;
6892    if ( expDiff < -1 ) return a;
6893    shortShift128Left(
6894        aSig0 | LIT64( 0x0001000000000000 ),
6895        aSig1,
6896        15 - ( expDiff < 0 ),
6897        &aSig0,
6898        &aSig1
6899    );
6900    shortShift128Left(
6901        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6902    q = le128( bSig0, bSig1, aSig0, aSig1 );
6903    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6904    expDiff -= 64;
6905    while ( 0 < expDiff ) {
6906        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6907        q = ( 4 < q ) ? q - 4 : 0;
6908        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6909        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6910        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6911        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6912        expDiff -= 61;
6913    }
6914    if ( -64 < expDiff ) {
6915        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6916        q = ( 4 < q ) ? q - 4 : 0;
6917        q >>= - expDiff;
6918        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6919        expDiff += 52;
6920        if ( expDiff < 0 ) {
6921            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6922        }
6923        else {
6924            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6925        }
6926        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6927        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6928    }
6929    else {
6930        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6931        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6932    }
6933    do {
6934        alternateASig0 = aSig0;
6935        alternateASig1 = aSig1;
6936        ++q;
6937        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6938    } while ( 0 <= (int64_t) aSig0 );
6939    add128(
6940        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6941    if (    ( sigMean0 < 0 )
6942         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6943        aSig0 = alternateASig0;
6944        aSig1 = alternateASig1;
6945    }
6946    zSign = ( (int64_t) aSig0 < 0 );
6947    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6948    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6949                                         status);
6950}
6951
6952/*----------------------------------------------------------------------------
6953| Returns the square root of the quadruple-precision floating-point value `a'.
6954| The operation is performed according to the IEC/IEEE Standard for Binary
6955| Floating-Point Arithmetic.
6956*----------------------------------------------------------------------------*/
6957
6958float128 float128_sqrt(float128 a, float_status *status)
6959{
6960    flag aSign;
6961    int32_t aExp, zExp;
6962    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6963    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6964
6965    aSig1 = extractFloat128Frac1( a );
6966    aSig0 = extractFloat128Frac0( a );
6967    aExp = extractFloat128Exp( a );
6968    aSign = extractFloat128Sign( a );
6969    if ( aExp == 0x7FFF ) {
6970        if (aSig0 | aSig1) {
6971            return propagateFloat128NaN(a, a, status);
6972        }
6973        if ( ! aSign ) return a;
6974        goto invalid;
6975    }
6976    if ( aSign ) {
6977        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6978 invalid:
6979        float_raise(float_flag_invalid, status);
6980        return float128_default_nan(status);
6981    }
6982    if ( aExp == 0 ) {
6983        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6984        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6985    }
6986    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6987    aSig0 |= LIT64( 0x0001000000000000 );
6988    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6989    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6990    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6991    doubleZSig0 = zSig0<<1;
6992    mul64To128( zSig0, zSig0, &term0, &term1 );
6993    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6994    while ( (int64_t) rem0 < 0 ) {
6995        --zSig0;
6996        doubleZSig0 -= 2;
6997        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6998    }
6999    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7000    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7001        if ( zSig1 == 0 ) zSig1 = 1;
7002        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7003        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7004        mul64To128( zSig1, zSig1, &term2, &term3 );
7005        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7006        while ( (int64_t) rem1 < 0 ) {
7007            --zSig1;
7008            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7009            term3 |= 1;
7010            term2 |= doubleZSig0;
7011            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7012        }
7013        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7014    }
7015    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7016    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7017
7018}
7019
7020/*----------------------------------------------------------------------------
7021| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7022| the corresponding value `b', and 0 otherwise.  The invalid exception is
7023| raised if either operand is a NaN.  Otherwise, the comparison is performed
7024| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7025*----------------------------------------------------------------------------*/
7026
7027int float128_eq(float128 a, float128 b, float_status *status)
7028{
7029
7030    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7031              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7032         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7033              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7034       ) {
7035        float_raise(float_flag_invalid, status);
7036        return 0;
7037    }
7038    return
7039           ( a.low == b.low )
7040        && (    ( a.high == b.high )
7041             || (    ( a.low == 0 )
7042                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7043           );
7044
7045}
7046
7047/*----------------------------------------------------------------------------
7048| Returns 1 if the quadruple-precision floating-point value `a' is less than
7049| or equal to the corresponding value `b', and 0 otherwise.  The invalid
7050| exception is raised if either operand is a NaN.  The comparison is performed
7051| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7052*----------------------------------------------------------------------------*/
7053
7054int float128_le(float128 a, float128 b, float_status *status)
7055{
7056    flag aSign, bSign;
7057
7058    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7059              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7061              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062       ) {
7063        float_raise(float_flag_invalid, status);
7064        return 0;
7065    }
7066    aSign = extractFloat128Sign( a );
7067    bSign = extractFloat128Sign( b );
7068    if ( aSign != bSign ) {
7069        return
7070               aSign
7071            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7072                 == 0 );
7073    }
7074    return
7075          aSign ? le128( b.high, b.low, a.high, a.low )
7076        : le128( a.high, a.low, b.high, b.low );
7077
7078}
7079
7080/*----------------------------------------------------------------------------
7081| Returns 1 if the quadruple-precision floating-point value `a' is less than
7082| the corresponding value `b', and 0 otherwise.  The invalid exception is
7083| raised if either operand is a NaN.  The comparison is performed according
7084| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7085*----------------------------------------------------------------------------*/
7086
7087int float128_lt(float128 a, float128 b, float_status *status)
7088{
7089    flag aSign, bSign;
7090
7091    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7092              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7093         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7094              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7095       ) {
7096        float_raise(float_flag_invalid, status);
7097        return 0;
7098    }
7099    aSign = extractFloat128Sign( a );
7100    bSign = extractFloat128Sign( b );
7101    if ( aSign != bSign ) {
7102        return
7103               aSign
7104            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7105                 != 0 );
7106    }
7107    return
7108          aSign ? lt128( b.high, b.low, a.high, a.low )
7109        : lt128( a.high, a.low, b.high, b.low );
7110
7111}
7112
7113/*----------------------------------------------------------------------------
7114| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7115| be compared, and 0 otherwise.  The invalid exception is raised if either
7116| operand is a NaN. The comparison is performed according to the IEC/IEEE
7117| Standard for Binary Floating-Point Arithmetic.
7118*----------------------------------------------------------------------------*/
7119
7120int float128_unordered(float128 a, float128 b, float_status *status)
7121{
7122    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7123              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7124         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7125              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7126       ) {
7127        float_raise(float_flag_invalid, status);
7128        return 1;
7129    }
7130    return 0;
7131}
7132
7133/*----------------------------------------------------------------------------
7134| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7135| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7136| exception.  The comparison is performed according to the IEC/IEEE Standard
7137| for Binary Floating-Point Arithmetic.
7138*----------------------------------------------------------------------------*/
7139
7140int float128_eq_quiet(float128 a, float128 b, float_status *status)
7141{
7142
7143    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7144              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7145         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7146              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7147       ) {
7148        if (float128_is_signaling_nan(a, status)
7149         || float128_is_signaling_nan(b, status)) {
7150            float_raise(float_flag_invalid, status);
7151        }
7152        return 0;
7153    }
7154    return
7155           ( a.low == b.low )
7156        && (    ( a.high == b.high )
7157             || (    ( a.low == 0 )
7158                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7159           );
7160
7161}
7162
7163/*----------------------------------------------------------------------------
7164| Returns 1 if the quadruple-precision floating-point value `a' is less than
7165| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7166| cause an exception.  Otherwise, the comparison is performed according to the
7167| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7168*----------------------------------------------------------------------------*/
7169
7170int float128_le_quiet(float128 a, float128 b, float_status *status)
7171{
7172    flag aSign, bSign;
7173
7174    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7175              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7176         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7177              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7178       ) {
7179        if (float128_is_signaling_nan(a, status)
7180         || float128_is_signaling_nan(b, status)) {
7181            float_raise(float_flag_invalid, status);
7182        }
7183        return 0;
7184    }
7185    aSign = extractFloat128Sign( a );
7186    bSign = extractFloat128Sign( b );
7187    if ( aSign != bSign ) {
7188        return
7189               aSign
7190            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7191                 == 0 );
7192    }
7193    return
7194          aSign ? le128( b.high, b.low, a.high, a.low )
7195        : le128( a.high, a.low, b.high, b.low );
7196
7197}
7198
7199/*----------------------------------------------------------------------------
7200| Returns 1 if the quadruple-precision floating-point value `a' is less than
7201| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7202| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7203| Standard for Binary Floating-Point Arithmetic.
7204*----------------------------------------------------------------------------*/
7205
7206int float128_lt_quiet(float128 a, float128 b, float_status *status)
7207{
7208    flag aSign, bSign;
7209
7210    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7211              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7212         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7213              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7214       ) {
7215        if (float128_is_signaling_nan(a, status)
7216         || float128_is_signaling_nan(b, status)) {
7217            float_raise(float_flag_invalid, status);
7218        }
7219        return 0;
7220    }
7221    aSign = extractFloat128Sign( a );
7222    bSign = extractFloat128Sign( b );
7223    if ( aSign != bSign ) {
7224        return
7225               aSign
7226            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7227                 != 0 );
7228    }
7229    return
7230          aSign ? lt128( b.high, b.low, a.high, a.low )
7231        : lt128( a.high, a.low, b.high, b.low );
7232
7233}
7234
7235/*----------------------------------------------------------------------------
7236| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7237| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7238| comparison is performed according to the IEC/IEEE Standard for Binary
7239| Floating-Point Arithmetic.
7240*----------------------------------------------------------------------------*/
7241
7242int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7243{
7244    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7245              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7246         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7247              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7248       ) {
7249        if (float128_is_signaling_nan(a, status)
7250         || float128_is_signaling_nan(b, status)) {
7251            float_raise(float_flag_invalid, status);
7252        }
7253        return 1;
7254    }
7255    return 0;
7256}
7257
7258/* misc functions */
7259float32 uint32_to_float32(uint32_t a, float_status *status)
7260{
7261    return int64_to_float32(a, status);
7262}
7263
7264float64 uint32_to_float64(uint32_t a, float_status *status)
7265{
7266    return int64_to_float64(a, status);
7267}
7268
7269uint32_t float32_to_uint32(float32 a, float_status *status)
7270{
7271    int64_t v;
7272    uint32_t res;
7273    int old_exc_flags = get_float_exception_flags(status);
7274
7275    v = float32_to_int64(a, status);
7276    if (v < 0) {
7277        res = 0;
7278    } else if (v > 0xffffffff) {
7279        res = 0xffffffff;
7280    } else {
7281        return v;
7282    }
7283    set_float_exception_flags(old_exc_flags, status);
7284    float_raise(float_flag_invalid, status);
7285    return res;
7286}
7287
7288uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7289{
7290    int64_t v;
7291    uint32_t res;
7292    int old_exc_flags = get_float_exception_flags(status);
7293
7294    v = float32_to_int64_round_to_zero(a, status);
7295    if (v < 0) {
7296        res = 0;
7297    } else if (v > 0xffffffff) {
7298        res = 0xffffffff;
7299    } else {
7300        return v;
7301    }
7302    set_float_exception_flags(old_exc_flags, status);
7303    float_raise(float_flag_invalid, status);
7304    return res;
7305}
7306
7307int16_t float32_to_int16(float32 a, float_status *status)
7308{
7309    int32_t v;
7310    int16_t res;
7311    int old_exc_flags = get_float_exception_flags(status);
7312
7313    v = float32_to_int32(a, status);
7314    if (v < -0x8000) {
7315        res = -0x8000;
7316    } else if (v > 0x7fff) {
7317        res = 0x7fff;
7318    } else {
7319        return v;
7320    }
7321
7322    set_float_exception_flags(old_exc_flags, status);
7323    float_raise(float_flag_invalid, status);
7324    return res;
7325}
7326
7327uint16_t float32_to_uint16(float32 a, float_status *status)
7328{
7329    int32_t v;
7330    uint16_t res;
7331    int old_exc_flags = get_float_exception_flags(status);
7332
7333    v = float32_to_int32(a, status);
7334    if (v < 0) {
7335        res = 0;
7336    } else if (v > 0xffff) {
7337        res = 0xffff;
7338    } else {
7339        return v;
7340    }
7341
7342    set_float_exception_flags(old_exc_flags, status);
7343    float_raise(float_flag_invalid, status);
7344    return res;
7345}
7346
7347uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7348{
7349    int64_t v;
7350    uint16_t res;
7351    int old_exc_flags = get_float_exception_flags(status);
7352
7353    v = float32_to_int64_round_to_zero(a, status);
7354    if (v < 0) {
7355        res = 0;
7356    } else if (v > 0xffff) {
7357        res = 0xffff;
7358    } else {
7359        return v;
7360    }
7361    set_float_exception_flags(old_exc_flags, status);
7362    float_raise(float_flag_invalid, status);
7363    return res;
7364}
7365
7366uint32_t float64_to_uint32(float64 a, float_status *status)
7367{
7368    uint64_t v;
7369    uint32_t res;
7370    int old_exc_flags = get_float_exception_flags(status);
7371
7372    v = float64_to_uint64(a, status);
7373    if (v > 0xffffffff) {
7374        res = 0xffffffff;
7375    } else {
7376        return v;
7377    }
7378    set_float_exception_flags(old_exc_flags, status);
7379    float_raise(float_flag_invalid, status);
7380    return res;
7381}
7382
7383uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7384{
7385    uint64_t v;
7386    uint32_t res;
7387    int old_exc_flags = get_float_exception_flags(status);
7388
7389    v = float64_to_uint64_round_to_zero(a, status);
7390    if (v > 0xffffffff) {
7391        res = 0xffffffff;
7392    } else {
7393        return v;
7394    }
7395    set_float_exception_flags(old_exc_flags, status);
7396    float_raise(float_flag_invalid, status);
7397    return res;
7398}
7399
7400int16_t float64_to_int16(float64 a, float_status *status)
7401{
7402    int64_t v;
7403    int16_t res;
7404    int old_exc_flags = get_float_exception_flags(status);
7405
7406    v = float64_to_int32(a, status);
7407    if (v < -0x8000) {
7408        res = -0x8000;
7409    } else if (v > 0x7fff) {
7410        res = 0x7fff;
7411    } else {
7412        return v;
7413    }
7414
7415    set_float_exception_flags(old_exc_flags, status);
7416    float_raise(float_flag_invalid, status);
7417    return res;
7418}
7419
7420uint16_t float64_to_uint16(float64 a, float_status *status)
7421{
7422    int64_t v;
7423    uint16_t res;
7424    int old_exc_flags = get_float_exception_flags(status);
7425
7426    v = float64_to_int32(a, status);
7427    if (v < 0) {
7428        res = 0;
7429    } else if (v > 0xffff) {
7430        res = 0xffff;
7431    } else {
7432        return v;
7433    }
7434
7435    set_float_exception_flags(old_exc_flags, status);
7436    float_raise(float_flag_invalid, status);
7437    return res;
7438}
7439
7440uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7441{
7442    int64_t v;
7443    uint16_t res;
7444    int old_exc_flags = get_float_exception_flags(status);
7445
7446    v = float64_to_int64_round_to_zero(a, status);
7447    if (v < 0) {
7448        res = 0;
7449    } else if (v > 0xffff) {
7450        res = 0xffff;
7451    } else {
7452        return v;
7453    }
7454    set_float_exception_flags(old_exc_flags, status);
7455    float_raise(float_flag_invalid, status);
7456    return res;
7457}
7458
7459/*----------------------------------------------------------------------------
7460| Returns the result of converting the double-precision floating-point value
7461| `a' to the 64-bit unsigned integer format.  The conversion is
7462| performed according to the IEC/IEEE Standard for Binary Floating-Point
7463| Arithmetic---which means in particular that the conversion is rounded
7464| according to the current rounding mode.  If `a' is a NaN, the largest
7465| positive integer is returned.  If the conversion overflows, the
7466| largest unsigned integer is returned.  If 'a' is negative, the value is
7467| rounded and zero is returned; negative values that do not round to zero
7468| will raise the inexact exception.
7469*----------------------------------------------------------------------------*/
7470
7471uint64_t float64_to_uint64(float64 a, float_status *status)
7472{
7473    flag aSign;
7474    int aExp;
7475    int shiftCount;
7476    uint64_t aSig, aSigExtra;
7477    a = float64_squash_input_denormal(a, status);
7478
7479    aSig = extractFloat64Frac(a);
7480    aExp = extractFloat64Exp(a);
7481    aSign = extractFloat64Sign(a);
7482    if (aSign && (aExp > 1022)) {
7483        float_raise(float_flag_invalid, status);
7484        if (float64_is_any_nan(a)) {
7485            return LIT64(0xFFFFFFFFFFFFFFFF);
7486        } else {
7487            return 0;
7488        }
7489    }
7490    if (aExp) {
7491        aSig |= LIT64(0x0010000000000000);
7492    }
7493    shiftCount = 0x433 - aExp;
7494    if (shiftCount <= 0) {
7495        if (0x43E < aExp) {
7496            float_raise(float_flag_invalid, status);
7497            return LIT64(0xFFFFFFFFFFFFFFFF);
7498        }
7499        aSigExtra = 0;
7500        aSig <<= -shiftCount;
7501    } else {
7502        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7503    }
7504    return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7505}
7506
7507uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7508{
7509    signed char current_rounding_mode = status->float_rounding_mode;
7510    set_float_rounding_mode(float_round_to_zero, status);
7511    uint64_t v = float64_to_uint64(a, status);
7512    set_float_rounding_mode(current_rounding_mode, status);
7513    return v;
7514}
7515
7516#define COMPARE(s, nan_exp)                                                  \
7517static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7518                                      int is_quiet, float_status *status)    \
7519{                                                                            \
7520    flag aSign, bSign;                                                       \
7521    uint ## s ## _t av, bv;                                                  \
7522    a = float ## s ## _squash_input_denormal(a, status);                     \
7523    b = float ## s ## _squash_input_denormal(b, status);                     \
7524                                                                             \
7525    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7526         extractFloat ## s ## Frac( a ) ) ||                                 \
7527        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7528          extractFloat ## s ## Frac( b ) )) {                                \
7529        if (!is_quiet ||                                                     \
7530            float ## s ## _is_signaling_nan(a, status) ||                  \
7531            float ## s ## _is_signaling_nan(b, status)) {                 \
7532            float_raise(float_flag_invalid, status);                         \
7533        }                                                                    \
7534        return float_relation_unordered;                                     \
7535    }                                                                        \
7536    aSign = extractFloat ## s ## Sign( a );                                  \
7537    bSign = extractFloat ## s ## Sign( b );                                  \
7538    av = float ## s ## _val(a);                                              \
7539    bv = float ## s ## _val(b);                                              \
7540    if ( aSign != bSign ) {                                                  \
7541        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7542            /* zero case */                                                  \
7543            return float_relation_equal;                                     \
7544        } else {                                                             \
7545            return 1 - (2 * aSign);                                          \
7546        }                                                                    \
7547    } else {                                                                 \
7548        if (av == bv) {                                                      \
7549            return float_relation_equal;                                     \
7550        } else {                                                             \
7551            return 1 - 2 * (aSign ^ ( av < bv ));                            \
7552        }                                                                    \
7553    }                                                                        \
7554}                                                                            \
7555                                                                             \
7556int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7557{                                                                            \
7558    return float ## s ## _compare_internal(a, b, 0, status);                 \
7559}                                                                            \
7560                                                                             \
7561int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7562                                 float_status *status)                       \
7563{                                                                            \
7564    return float ## s ## _compare_internal(a, b, 1, status);                 \
7565}
7566
7567COMPARE(32, 0xff)
7568COMPARE(64, 0x7ff)
7569
7570static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7571                                            int is_quiet, float_status *status)
7572{
7573    flag aSign, bSign;
7574
7575    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7576        float_raise(float_flag_invalid, status);
7577        return float_relation_unordered;
7578    }
7579    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7580          ( extractFloatx80Frac( a )<<1 ) ) ||
7581        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7582          ( extractFloatx80Frac( b )<<1 ) )) {
7583        if (!is_quiet ||
7584            floatx80_is_signaling_nan(a, status) ||
7585            floatx80_is_signaling_nan(b, status)) {
7586            float_raise(float_flag_invalid, status);
7587        }
7588        return float_relation_unordered;
7589    }
7590    aSign = extractFloatx80Sign( a );
7591    bSign = extractFloatx80Sign( b );
7592    if ( aSign != bSign ) {
7593
7594        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7595             ( ( a.low | b.low ) == 0 ) ) {
7596            /* zero case */
7597            return float_relation_equal;
7598        } else {
7599            return 1 - (2 * aSign);
7600        }
7601    } else {
7602        if (a.low == b.low && a.high == b.high) {
7603            return float_relation_equal;
7604        } else {
7605            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7606        }
7607    }
7608}
7609
7610int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7611{
7612    return floatx80_compare_internal(a, b, 0, status);
7613}
7614
7615int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7616{
7617    return floatx80_compare_internal(a, b, 1, status);
7618}
7619
7620static inline int float128_compare_internal(float128 a, float128 b,
7621                                            int is_quiet, float_status *status)
7622{
7623    flag aSign, bSign;
7624
7625    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7626          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7627        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7628          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7629        if (!is_quiet ||
7630            float128_is_signaling_nan(a, status) ||
7631            float128_is_signaling_nan(b, status)) {
7632            float_raise(float_flag_invalid, status);
7633        }
7634        return float_relation_unordered;
7635    }
7636    aSign = extractFloat128Sign( a );
7637    bSign = extractFloat128Sign( b );
7638    if ( aSign != bSign ) {
7639        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7640            /* zero case */
7641            return float_relation_equal;
7642        } else {
7643            return 1 - (2 * aSign);
7644        }
7645    } else {
7646        if (a.low == b.low && a.high == b.high) {
7647            return float_relation_equal;
7648        } else {
7649            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7650        }
7651    }
7652}
7653
7654int float128_compare(float128 a, float128 b, float_status *status)
7655{
7656    return float128_compare_internal(a, b, 0, status);
7657}
7658
7659int float128_compare_quiet(float128 a, float128 b, float_status *status)
7660{
7661    return float128_compare_internal(a, b, 1, status);
7662}
7663
7664/* min() and max() functions. These can't be implemented as
7665 * 'compare and pick one input' because that would mishandle
7666 * NaNs and +0 vs -0.
7667 *
7668 * minnum() and maxnum() functions. These are similar to the min()
7669 * and max() functions but if one of the arguments is a QNaN and
7670 * the other is numerical then the numerical argument is returned.
7671 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7672 * and maxNum() operations. min() and max() are the typical min/max
7673 * semantics provided by many CPUs which predate that specification.
7674 *
7675 * minnummag() and maxnummag() functions correspond to minNumMag()
7676 * and minNumMag() from the IEEE-754 2008.
7677 */
7678#define MINMAX(s)                                                       \
7679static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7680                                               int ismin, int isieee,   \
7681                                               int ismag,               \
7682                                               float_status *status)    \
7683{                                                                       \
7684    flag aSign, bSign;                                                  \
7685    uint ## s ## _t av, bv, aav, abv;                                   \
7686    a = float ## s ## _squash_input_denormal(a, status);                \
7687    b = float ## s ## _squash_input_denormal(b, status);                \
7688    if (float ## s ## _is_any_nan(a) ||                                 \
7689        float ## s ## _is_any_nan(b)) {                                 \
7690        if (isieee) {                                                   \
7691            if (float ## s ## _is_quiet_nan(a, status) &&               \
7692                !float ## s ##_is_any_nan(b)) {                         \
7693                return b;                                               \
7694            } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7695                       !float ## s ## _is_any_nan(a)) {                \
7696                return a;                                               \
7697            }                                                           \
7698        }                                                               \
7699        return propagateFloat ## s ## NaN(a, b, status);                \
7700    }                                                                   \
7701    aSign = extractFloat ## s ## Sign(a);                               \
7702    bSign = extractFloat ## s ## Sign(b);                               \
7703    av = float ## s ## _val(a);                                         \
7704    bv = float ## s ## _val(b);                                         \
7705    if (ismag) {                                                        \
7706        aav = float ## s ## _abs(av);                                   \
7707        abv = float ## s ## _abs(bv);                                   \
7708        if (aav != abv) {                                               \
7709            if (ismin) {                                                \
7710                return (aav < abv) ? a : b;                             \
7711            } else {                                                    \
7712                return (aav < abv) ? b : a;                             \
7713            }                                                           \
7714        }                                                               \
7715    }                                                                   \
7716    if (aSign != bSign) {                                               \
7717        if (ismin) {                                                    \
7718            return aSign ? a : b;                                       \
7719        } else {                                                        \
7720            return aSign ? b : a;                                       \
7721        }                                                               \
7722    } else {                                                            \
7723        if (ismin) {                                                    \
7724            return (aSign ^ (av < bv)) ? a : b;                         \
7725        } else {                                                        \
7726            return (aSign ^ (av < bv)) ? b : a;                         \
7727        }                                                               \
7728    }                                                                   \
7729}                                                                       \
7730                                                                        \
7731float ## s float ## s ## _min(float ## s a, float ## s b,               \
7732                              float_status *status)                     \
7733{                                                                       \
7734    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7735}                                                                       \
7736                                                                        \
7737float ## s float ## s ## _max(float ## s a, float ## s b,               \
7738                              float_status *status)                     \
7739{                                                                       \
7740    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7741}                                                                       \
7742                                                                        \
7743float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7744                                 float_status *status)                  \
7745{                                                                       \
7746    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7747}                                                                       \
7748                                                                        \
7749float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7750                                 float_status *status)                  \
7751{                                                                       \
7752    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7753}                                                                       \
7754                                                                        \
7755float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7756                                    float_status *status)               \
7757{                                                                       \
7758    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7759}                                                                       \
7760                                                                        \
7761float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7762                                    float_status *status)               \
7763{                                                                       \
7764    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7765}
7766
7767MINMAX(32)
7768MINMAX(64)
7769
7770
7771/* Multiply A by 2 raised to the power N.  */
7772float32 float32_scalbn(float32 a, int n, float_status *status)
7773{
7774    flag aSign;
7775    int16_t aExp;
7776    uint32_t aSig;
7777
7778    a = float32_squash_input_denormal(a, status);
7779    aSig = extractFloat32Frac( a );
7780    aExp = extractFloat32Exp( a );
7781    aSign = extractFloat32Sign( a );
7782
7783    if ( aExp == 0xFF ) {
7784        if ( aSig ) {
7785            return propagateFloat32NaN(a, a, status);
7786        }
7787        return a;
7788    }
7789    if (aExp != 0) {
7790        aSig |= 0x00800000;
7791    } else if (aSig == 0) {
7792        return a;
7793    } else {
7794        aExp++;
7795    }
7796
7797    if (n > 0x200) {
7798        n = 0x200;
7799    } else if (n < -0x200) {
7800        n = -0x200;
7801    }
7802
7803    aExp += n - 1;
7804    aSig <<= 7;
7805    return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7806}
7807
7808float64 float64_scalbn(float64 a, int n, float_status *status)
7809{
7810    flag aSign;
7811    int16_t aExp;
7812    uint64_t aSig;
7813
7814    a = float64_squash_input_denormal(a, status);
7815    aSig = extractFloat64Frac( a );
7816    aExp = extractFloat64Exp( a );
7817    aSign = extractFloat64Sign( a );
7818
7819    if ( aExp == 0x7FF ) {
7820        if ( aSig ) {
7821            return propagateFloat64NaN(a, a, status);
7822        }
7823        return a;
7824    }
7825    if (aExp != 0) {
7826        aSig |= LIT64( 0x0010000000000000 );
7827    } else if (aSig == 0) {
7828        return a;
7829    } else {
7830        aExp++;
7831    }
7832
7833    if (n > 0x1000) {
7834        n = 0x1000;
7835    } else if (n < -0x1000) {
7836        n = -0x1000;
7837    }
7838
7839    aExp += n - 1;
7840    aSig <<= 10;
7841    return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7842}
7843
7844floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7845{
7846    flag aSign;
7847    int32_t aExp;
7848    uint64_t aSig;
7849
7850    if (floatx80_invalid_encoding(a)) {
7851        float_raise(float_flag_invalid, status);
7852        return floatx80_default_nan(status);
7853    }
7854    aSig = extractFloatx80Frac( a );
7855    aExp = extractFloatx80Exp( a );
7856    aSign = extractFloatx80Sign( a );
7857
7858    if ( aExp == 0x7FFF ) {
7859        if ( aSig<<1 ) {
7860            return propagateFloatx80NaN(a, a, status);
7861        }
7862        return a;
7863    }
7864
7865    if (aExp == 0) {
7866        if (aSig == 0) {
7867            return a;
7868        }
7869        aExp++;
7870    }
7871
7872    if (n > 0x10000) {
7873        n = 0x10000;
7874    } else if (n < -0x10000) {
7875        n = -0x10000;
7876    }
7877
7878    aExp += n;
7879    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7880                                         aSign, aExp, aSig, 0, status);
7881}
7882
7883float128 float128_scalbn(float128 a, int n, float_status *status)
7884{
7885    flag aSign;
7886    int32_t aExp;
7887    uint64_t aSig0, aSig1;
7888
7889    aSig1 = extractFloat128Frac1( a );
7890    aSig0 = extractFloat128Frac0( a );
7891    aExp = extractFloat128Exp( a );
7892    aSign = extractFloat128Sign( a );
7893    if ( aExp == 0x7FFF ) {
7894        if ( aSig0 | aSig1 ) {
7895            return propagateFloat128NaN(a, a, status);
7896        }
7897        return a;
7898    }
7899    if (aExp != 0) {
7900        aSig0 |= LIT64( 0x0001000000000000 );
7901    } else if (aSig0 == 0 && aSig1 == 0) {
7902        return a;
7903    } else {
7904        aExp++;
7905    }
7906
7907    if (n > 0x10000) {
7908        n = 0x10000;
7909    } else if (n < -0x10000) {
7910        n = -0x10000;
7911    }
7912
7913    aExp += n - 1;
7914    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7915                                         , status);
7916
7917}
7918