qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * Derived from SoftFloat.
   5 */
   6
   7/*============================================================================
   8
   9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10Package, Release 2b.
  11
  12Written by John R. Hauser.  This work was made possible in part by the
  13International Computer Science Institute, located at Suite 600, 1947 Center
  14Street, Berkeley, California 94704.  Funding was partially provided by the
  15National Science Foundation under grant MIP-9311980.  The original version
  16of this code was written as part of a project to build a fixed-point vector
  17processor in collaboration with the University of California at Berkeley,
  18overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20arithmetic/SoftFloat.html'.
  21
  22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31Derivative works are acceptable, even for commercial purposes, so long as
  32(1) the source code for the derivative work includes prominent notice that
  33the work is derivative, and (2) the source code includes prominent notice with
  34these four paragraphs for those parts of this code that are retained.
  35
  36=============================================================================*/
  37
  38/* softfloat (and in particular the code in softfloat-specialize.h) is
  39 * target-dependent and needs the TARGET_* macros.
  40 */
  41#include "config.h"
  42
  43#include "fpu/softfloat.h"
  44
  45/*----------------------------------------------------------------------------
  46| Primitive arithmetic functions, including multi-word arithmetic, and
  47| division and square root approximations.  (Can be specialized to target if
  48| desired.)
  49*----------------------------------------------------------------------------*/
  50#include "softfloat-macros.h"
  51
  52/*----------------------------------------------------------------------------
  53| Functions and definitions to determine:  (1) whether tininess for underflow
  54| is detected before or after rounding by default, (2) what (if anything)
  55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
  56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  57| are propagated from function inputs to output.  These details are target-
  58| specific.
  59*----------------------------------------------------------------------------*/
  60#include "softfloat-specialize.h"
  61
  62void set_float_rounding_mode(int val STATUS_PARAM)
  63{
  64    STATUS(float_rounding_mode) = val;
  65}
  66
  67void set_float_exception_flags(int val STATUS_PARAM)
  68{
  69    STATUS(float_exception_flags) = val;
  70}
  71
  72void set_floatx80_rounding_precision(int val STATUS_PARAM)
  73{
  74    STATUS(floatx80_rounding_precision) = val;
  75}
  76
  77/*----------------------------------------------------------------------------
  78| Returns the fraction bits of the half-precision floating-point value `a'.
  79*----------------------------------------------------------------------------*/
  80
  81INLINE uint32_t extractFloat16Frac(float16 a)
  82{
  83    return float16_val(a) & 0x3ff;
  84}
  85
  86/*----------------------------------------------------------------------------
  87| Returns the exponent bits of the half-precision floating-point value `a'.
  88*----------------------------------------------------------------------------*/
  89
  90INLINE int_fast16_t extractFloat16Exp(float16 a)
  91{
  92    return (float16_val(a) >> 10) & 0x1f;
  93}
  94
  95/*----------------------------------------------------------------------------
  96| Returns the sign bit of the single-precision floating-point value `a'.
  97*----------------------------------------------------------------------------*/
  98
  99INLINE flag extractFloat16Sign(float16 a)
 100{
 101    return float16_val(a)>>15;
 102}
 103
 104/*----------------------------------------------------------------------------
 105| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 106| and 7, and returns the properly rounded 32-bit integer corresponding to the
 107| input.  If `zSign' is 1, the input is negated before being converted to an
 108| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 109| is simply rounded to an integer, with the inexact exception raised if the
 110| input cannot be represented exactly as an integer.  However, if the fixed-
 111| point input is too large, the invalid exception is raised and the largest
 112| positive or negative integer is returned.
 113*----------------------------------------------------------------------------*/
 114
 115static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 116{
 117    int8 roundingMode;
 118    flag roundNearestEven;
 119    int8 roundIncrement, roundBits;
 120    int32_t z;
 121
 122    roundingMode = STATUS(float_rounding_mode);
 123    roundNearestEven = ( roundingMode == float_round_nearest_even );
 124    roundIncrement = 0x40;
 125    if ( ! roundNearestEven ) {
 126        if ( roundingMode == float_round_to_zero ) {
 127            roundIncrement = 0;
 128        }
 129        else {
 130            roundIncrement = 0x7F;
 131            if ( zSign ) {
 132                if ( roundingMode == float_round_up ) roundIncrement = 0;
 133            }
 134            else {
 135                if ( roundingMode == float_round_down ) roundIncrement = 0;
 136            }
 137        }
 138    }
 139    roundBits = absZ & 0x7F;
 140    absZ = ( absZ + roundIncrement )>>7;
 141    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 142    z = absZ;
 143    if ( zSign ) z = - z;
 144    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 145        float_raise( float_flag_invalid STATUS_VAR);
 146        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 147    }
 148    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 149    return z;
 150
 151}
 152
 153/*----------------------------------------------------------------------------
 154| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155| `absZ1', with binary point between bits 63 and 64 (between the input words),
 156| and returns the properly rounded 64-bit integer corresponding to the input.
 157| If `zSign' is 1, the input is negated before being converted to an integer.
 158| Ordinarily, the fixed-point input is simply rounded to an integer, with
 159| the inexact exception raised if the input cannot be represented exactly as
 160| an integer.  However, if the fixed-point input is too large, the invalid
 161| exception is raised and the largest positive or negative integer is
 162| returned.
 163*----------------------------------------------------------------------------*/
 164
 165static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 166{
 167    int8 roundingMode;
 168    flag roundNearestEven, increment;
 169    int64_t z;
 170
 171    roundingMode = STATUS(float_rounding_mode);
 172    roundNearestEven = ( roundingMode == float_round_nearest_even );
 173    increment = ( (int64_t) absZ1 < 0 );
 174    if ( ! roundNearestEven ) {
 175        if ( roundingMode == float_round_to_zero ) {
 176            increment = 0;
 177        }
 178        else {
 179            if ( zSign ) {
 180                increment = ( roundingMode == float_round_down ) && absZ1;
 181            }
 182            else {
 183                increment = ( roundingMode == float_round_up ) && absZ1;
 184            }
 185        }
 186    }
 187    if ( increment ) {
 188        ++absZ0;
 189        if ( absZ0 == 0 ) goto overflow;
 190        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191    }
 192    z = absZ0;
 193    if ( zSign ) z = - z;
 194    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195 overflow:
 196        float_raise( float_flag_invalid STATUS_VAR);
 197        return
 198              zSign ? (int64_t) LIT64( 0x8000000000000000 )
 199            : LIT64( 0x7FFFFFFFFFFFFFFF );
 200    }
 201    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 202    return z;
 203
 204}
 205
 206/*----------------------------------------------------------------------------
 207| Returns the fraction bits of the single-precision floating-point value `a'.
 208*----------------------------------------------------------------------------*/
 209
 210INLINE uint32_t extractFloat32Frac( float32 a )
 211{
 212
 213    return float32_val(a) & 0x007FFFFF;
 214
 215}
 216
 217/*----------------------------------------------------------------------------
 218| Returns the exponent bits of the single-precision floating-point value `a'.
 219*----------------------------------------------------------------------------*/
 220
 221INLINE int_fast16_t extractFloat32Exp(float32 a)
 222{
 223
 224    return ( float32_val(a)>>23 ) & 0xFF;
 225
 226}
 227
 228/*----------------------------------------------------------------------------
 229| Returns the sign bit of the single-precision floating-point value `a'.
 230*----------------------------------------------------------------------------*/
 231
 232INLINE flag extractFloat32Sign( float32 a )
 233{
 234
 235    return float32_val(a)>>31;
 236
 237}
 238
 239/*----------------------------------------------------------------------------
 240| If `a' is denormal and we are in flush-to-zero mode then set the
 241| input-denormal exception and return zero. Otherwise just return the value.
 242*----------------------------------------------------------------------------*/
 243static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 244{
 245    if (STATUS(flush_inputs_to_zero)) {
 246        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 247            float_raise(float_flag_input_denormal STATUS_VAR);
 248            return make_float32(float32_val(a) & 0x80000000);
 249        }
 250    }
 251    return a;
 252}
 253
 254/*----------------------------------------------------------------------------
 255| Normalizes the subnormal single-precision floating-point value represented
 256| by the denormalized significand `aSig'.  The normalized exponent and
 257| significand are stored at the locations pointed to by `zExpPtr' and
 258| `zSigPtr', respectively.
 259*----------------------------------------------------------------------------*/
 260
 261static void
 262 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 263{
 264    int8 shiftCount;
 265
 266    shiftCount = countLeadingZeros32( aSig ) - 8;
 267    *zSigPtr = aSig<<shiftCount;
 268    *zExpPtr = 1 - shiftCount;
 269
 270}
 271
 272/*----------------------------------------------------------------------------
 273| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 274| single-precision floating-point value, returning the result.  After being
 275| shifted into the proper positions, the three fields are simply added
 276| together to form the result.  This means that any integer portion of `zSig'
 277| will be added into the exponent.  Since a properly normalized significand
 278| will have an integer portion equal to 1, the `zExp' input should be 1 less
 279| than the desired result exponent whenever `zSig' is a complete, normalized
 280| significand.
 281*----------------------------------------------------------------------------*/
 282
 283INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 284{
 285
 286    return make_float32(
 287          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 288
 289}
 290
 291/*----------------------------------------------------------------------------
 292| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 293| and significand `zSig', and returns the proper single-precision floating-
 294| point value corresponding to the abstract input.  Ordinarily, the abstract
 295| value is simply rounded and packed into the single-precision format, with
 296| the inexact exception raised if the abstract input cannot be represented
 297| exactly.  However, if the abstract value is too large, the overflow and
 298| inexact exceptions are raised and an infinity or maximal finite value is
 299| returned.  If the abstract value is too small, the input value is rounded to
 300| a subnormal number, and the underflow and inexact exceptions are raised if
 301| the abstract input cannot be represented exactly as a subnormal single-
 302| precision floating-point number.
 303|     The input significand `zSig' has its binary point between bits 30
 304| and 29, which is 7 bits to the left of the usual location.  This shifted
 305| significand must be normalized or smaller.  If `zSig' is not normalized,
 306| `zExp' must be 0; in that case, the result returned is a subnormal number,
 307| and it must not require rounding.  In the usual case that `zSig' is
 308| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 309| The handling of underflow and overflow follows the IEC/IEEE Standard for
 310| Binary Floating-Point Arithmetic.
 311*----------------------------------------------------------------------------*/
 312
 313static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 314{
 315    int8 roundingMode;
 316    flag roundNearestEven;
 317    int8 roundIncrement, roundBits;
 318    flag isTiny;
 319
 320    roundingMode = STATUS(float_rounding_mode);
 321    roundNearestEven = ( roundingMode == float_round_nearest_even );
 322    roundIncrement = 0x40;
 323    if ( ! roundNearestEven ) {
 324        if ( roundingMode == float_round_to_zero ) {
 325            roundIncrement = 0;
 326        }
 327        else {
 328            roundIncrement = 0x7F;
 329            if ( zSign ) {
 330                if ( roundingMode == float_round_up ) roundIncrement = 0;
 331            }
 332            else {
 333                if ( roundingMode == float_round_down ) roundIncrement = 0;
 334            }
 335        }
 336    }
 337    roundBits = zSig & 0x7F;
 338    if ( 0xFD <= (uint16_t) zExp ) {
 339        if (    ( 0xFD < zExp )
 340             || (    ( zExp == 0xFD )
 341                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 342           ) {
 343            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 344            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 345        }
 346        if ( zExp < 0 ) {
 347            if (STATUS(flush_to_zero)) {
 348                float_raise(float_flag_output_denormal STATUS_VAR);
 349                return packFloat32(zSign, 0, 0);
 350            }
 351            isTiny =
 352                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 353                || ( zExp < -1 )
 354                || ( zSig + roundIncrement < 0x80000000 );
 355            shift32RightJamming( zSig, - zExp, &zSig );
 356            zExp = 0;
 357            roundBits = zSig & 0x7F;
 358            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 359        }
 360    }
 361    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 362    zSig = ( zSig + roundIncrement )>>7;
 363    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 364    if ( zSig == 0 ) zExp = 0;
 365    return packFloat32( zSign, zExp, zSig );
 366
 367}
 368
 369/*----------------------------------------------------------------------------
 370| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 371| and significand `zSig', and returns the proper single-precision floating-
 372| point value corresponding to the abstract input.  This routine is just like
 373| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 374| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 375| floating-point exponent.
 376*----------------------------------------------------------------------------*/
 377
 378static float32
 379 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 380{
 381    int8 shiftCount;
 382
 383    shiftCount = countLeadingZeros32( zSig ) - 1;
 384    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 385
 386}
 387
 388/*----------------------------------------------------------------------------
 389| Returns the fraction bits of the double-precision floating-point value `a'.
 390*----------------------------------------------------------------------------*/
 391
 392INLINE uint64_t extractFloat64Frac( float64 a )
 393{
 394
 395    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 396
 397}
 398
 399/*----------------------------------------------------------------------------
 400| Returns the exponent bits of the double-precision floating-point value `a'.
 401*----------------------------------------------------------------------------*/
 402
 403INLINE int_fast16_t extractFloat64Exp(float64 a)
 404{
 405
 406    return ( float64_val(a)>>52 ) & 0x7FF;
 407
 408}
 409
 410/*----------------------------------------------------------------------------
 411| Returns the sign bit of the double-precision floating-point value `a'.
 412*----------------------------------------------------------------------------*/
 413
 414INLINE flag extractFloat64Sign( float64 a )
 415{
 416
 417    return float64_val(a)>>63;
 418
 419}
 420
 421/*----------------------------------------------------------------------------
 422| If `a' is denormal and we are in flush-to-zero mode then set the
 423| input-denormal exception and return zero. Otherwise just return the value.
 424*----------------------------------------------------------------------------*/
 425static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 426{
 427    if (STATUS(flush_inputs_to_zero)) {
 428        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 429            float_raise(float_flag_input_denormal STATUS_VAR);
 430            return make_float64(float64_val(a) & (1ULL << 63));
 431        }
 432    }
 433    return a;
 434}
 435
 436/*----------------------------------------------------------------------------
 437| Normalizes the subnormal double-precision floating-point value represented
 438| by the denormalized significand `aSig'.  The normalized exponent and
 439| significand are stored at the locations pointed to by `zExpPtr' and
 440| `zSigPtr', respectively.
 441*----------------------------------------------------------------------------*/
 442
 443static void
 444 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 445{
 446    int8 shiftCount;
 447
 448    shiftCount = countLeadingZeros64( aSig ) - 11;
 449    *zSigPtr = aSig<<shiftCount;
 450    *zExpPtr = 1 - shiftCount;
 451
 452}
 453
 454/*----------------------------------------------------------------------------
 455| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 456| double-precision floating-point value, returning the result.  After being
 457| shifted into the proper positions, the three fields are simply added
 458| together to form the result.  This means that any integer portion of `zSig'
 459| will be added into the exponent.  Since a properly normalized significand
 460| will have an integer portion equal to 1, the `zExp' input should be 1 less
 461| than the desired result exponent whenever `zSig' is a complete, normalized
 462| significand.
 463*----------------------------------------------------------------------------*/
 464
 465INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 466{
 467
 468    return make_float64(
 469        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 470
 471}
 472
 473/*----------------------------------------------------------------------------
 474| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 475| and significand `zSig', and returns the proper double-precision floating-
 476| point value corresponding to the abstract input.  Ordinarily, the abstract
 477| value is simply rounded and packed into the double-precision format, with
 478| the inexact exception raised if the abstract input cannot be represented
 479| exactly.  However, if the abstract value is too large, the overflow and
 480| inexact exceptions are raised and an infinity or maximal finite value is
 481| returned.  If the abstract value is too small, the input value is rounded
 482| to a subnormal number, and the underflow and inexact exceptions are raised
 483| if the abstract input cannot be represented exactly as a subnormal double-
 484| precision floating-point number.
 485|     The input significand `zSig' has its binary point between bits 62
 486| and 61, which is 10 bits to the left of the usual location.  This shifted
 487| significand must be normalized or smaller.  If `zSig' is not normalized,
 488| `zExp' must be 0; in that case, the result returned is a subnormal number,
 489| and it must not require rounding.  In the usual case that `zSig' is
 490| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 491| The handling of underflow and overflow follows the IEC/IEEE Standard for
 492| Binary Floating-Point Arithmetic.
 493*----------------------------------------------------------------------------*/
 494
 495static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 496{
 497    int8 roundingMode;
 498    flag roundNearestEven;
 499    int_fast16_t roundIncrement, roundBits;
 500    flag isTiny;
 501
 502    roundingMode = STATUS(float_rounding_mode);
 503    roundNearestEven = ( roundingMode == float_round_nearest_even );
 504    roundIncrement = 0x200;
 505    if ( ! roundNearestEven ) {
 506        if ( roundingMode == float_round_to_zero ) {
 507            roundIncrement = 0;
 508        }
 509        else {
 510            roundIncrement = 0x3FF;
 511            if ( zSign ) {
 512                if ( roundingMode == float_round_up ) roundIncrement = 0;
 513            }
 514            else {
 515                if ( roundingMode == float_round_down ) roundIncrement = 0;
 516            }
 517        }
 518    }
 519    roundBits = zSig & 0x3FF;
 520    if ( 0x7FD <= (uint16_t) zExp ) {
 521        if (    ( 0x7FD < zExp )
 522             || (    ( zExp == 0x7FD )
 523                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 524           ) {
 525            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 526            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 527        }
 528        if ( zExp < 0 ) {
 529            if (STATUS(flush_to_zero)) {
 530                float_raise(float_flag_output_denormal STATUS_VAR);
 531                return packFloat64(zSign, 0, 0);
 532            }
 533            isTiny =
 534                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 535                || ( zExp < -1 )
 536                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 537            shift64RightJamming( zSig, - zExp, &zSig );
 538            zExp = 0;
 539            roundBits = zSig & 0x3FF;
 540            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 541        }
 542    }
 543    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 544    zSig = ( zSig + roundIncrement )>>10;
 545    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 546    if ( zSig == 0 ) zExp = 0;
 547    return packFloat64( zSign, zExp, zSig );
 548
 549}
 550
 551/*----------------------------------------------------------------------------
 552| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 553| and significand `zSig', and returns the proper double-precision floating-
 554| point value corresponding to the abstract input.  This routine is just like
 555| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 556| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 557| floating-point exponent.
 558*----------------------------------------------------------------------------*/
 559
 560static float64
 561 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 562{
 563    int8 shiftCount;
 564
 565    shiftCount = countLeadingZeros64( zSig ) - 1;
 566    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 567
 568}
 569
 570/*----------------------------------------------------------------------------
 571| Returns the fraction bits of the extended double-precision floating-point
 572| value `a'.
 573*----------------------------------------------------------------------------*/
 574
 575INLINE uint64_t extractFloatx80Frac( floatx80 a )
 576{
 577
 578    return a.low;
 579
 580}
 581
 582/*----------------------------------------------------------------------------
 583| Returns the exponent bits of the extended double-precision floating-point
 584| value `a'.
 585*----------------------------------------------------------------------------*/
 586
 587INLINE int32 extractFloatx80Exp( floatx80 a )
 588{
 589
 590    return a.high & 0x7FFF;
 591
 592}
 593
 594/*----------------------------------------------------------------------------
 595| Returns the sign bit of the extended double-precision floating-point value
 596| `a'.
 597*----------------------------------------------------------------------------*/
 598
 599INLINE flag extractFloatx80Sign( floatx80 a )
 600{
 601
 602    return a.high>>15;
 603
 604}
 605
 606/*----------------------------------------------------------------------------
 607| Normalizes the subnormal extended double-precision floating-point value
 608| represented by the denormalized significand `aSig'.  The normalized exponent
 609| and significand are stored at the locations pointed to by `zExpPtr' and
 610| `zSigPtr', respectively.
 611*----------------------------------------------------------------------------*/
 612
 613static void
 614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 615{
 616    int8 shiftCount;
 617
 618    shiftCount = countLeadingZeros64( aSig );
 619    *zSigPtr = aSig<<shiftCount;
 620    *zExpPtr = 1 - shiftCount;
 621
 622}
 623
 624/*----------------------------------------------------------------------------
 625| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 626| extended double-precision floating-point value, returning the result.
 627*----------------------------------------------------------------------------*/
 628
 629INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 630{
 631    floatx80 z;
 632
 633    z.low = zSig;
 634    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 635    return z;
 636
 637}
 638
 639/*----------------------------------------------------------------------------
 640| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 641| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 642| and returns the proper extended double-precision floating-point value
 643| corresponding to the abstract input.  Ordinarily, the abstract value is
 644| rounded and packed into the extended double-precision format, with the
 645| inexact exception raised if the abstract input cannot be represented
 646| exactly.  However, if the abstract value is too large, the overflow and
 647| inexact exceptions are raised and an infinity or maximal finite value is
 648| returned.  If the abstract value is too small, the input value is rounded to
 649| a subnormal number, and the underflow and inexact exceptions are raised if
 650| the abstract input cannot be represented exactly as a subnormal extended
 651| double-precision floating-point number.
 652|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 653| number of bits as single or double precision, respectively.  Otherwise, the
 654| result is rounded to the full precision of the extended double-precision
 655| format.
 656|     The input significand must be normalized or smaller.  If the input
 657| significand is not normalized, `zExp' must be 0; in that case, the result
 658| returned is a subnormal number, and it must not require rounding.  The
 659| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 660| Floating-Point Arithmetic.
 661*----------------------------------------------------------------------------*/
 662
 663static floatx80
 664 roundAndPackFloatx80(
 665     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 666 STATUS_PARAM)
 667{
 668    int8 roundingMode;
 669    flag roundNearestEven, increment, isTiny;
 670    int64 roundIncrement, roundMask, roundBits;
 671
 672    roundingMode = STATUS(float_rounding_mode);
 673    roundNearestEven = ( roundingMode == float_round_nearest_even );
 674    if ( roundingPrecision == 80 ) goto precision80;
 675    if ( roundingPrecision == 64 ) {
 676        roundIncrement = LIT64( 0x0000000000000400 );
 677        roundMask = LIT64( 0x00000000000007FF );
 678    }
 679    else if ( roundingPrecision == 32 ) {
 680        roundIncrement = LIT64( 0x0000008000000000 );
 681        roundMask = LIT64( 0x000000FFFFFFFFFF );
 682    }
 683    else {
 684        goto precision80;
 685    }
 686    zSig0 |= ( zSig1 != 0 );
 687    if ( ! roundNearestEven ) {
 688        if ( roundingMode == float_round_to_zero ) {
 689            roundIncrement = 0;
 690        }
 691        else {
 692            roundIncrement = roundMask;
 693            if ( zSign ) {
 694                if ( roundingMode == float_round_up ) roundIncrement = 0;
 695            }
 696            else {
 697                if ( roundingMode == float_round_down ) roundIncrement = 0;
 698            }
 699        }
 700    }
 701    roundBits = zSig0 & roundMask;
 702    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 703        if (    ( 0x7FFE < zExp )
 704             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 705           ) {
 706            goto overflow;
 707        }
 708        if ( zExp <= 0 ) {
 709            if (STATUS(flush_to_zero)) {
 710                float_raise(float_flag_output_denormal STATUS_VAR);
 711                return packFloatx80(zSign, 0, 0);
 712            }
 713            isTiny =
 714                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 715                || ( zExp < 0 )
 716                || ( zSig0 <= zSig0 + roundIncrement );
 717            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 718            zExp = 0;
 719            roundBits = zSig0 & roundMask;
 720            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 721            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 722            zSig0 += roundIncrement;
 723            if ( (int64_t) zSig0 < 0 ) zExp = 1;
 724            roundIncrement = roundMask + 1;
 725            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 726                roundMask |= roundIncrement;
 727            }
 728            zSig0 &= ~ roundMask;
 729            return packFloatx80( zSign, zExp, zSig0 );
 730        }
 731    }
 732    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 733    zSig0 += roundIncrement;
 734    if ( zSig0 < roundIncrement ) {
 735        ++zExp;
 736        zSig0 = LIT64( 0x8000000000000000 );
 737    }
 738    roundIncrement = roundMask + 1;
 739    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 740        roundMask |= roundIncrement;
 741    }
 742    zSig0 &= ~ roundMask;
 743    if ( zSig0 == 0 ) zExp = 0;
 744    return packFloatx80( zSign, zExp, zSig0 );
 745 precision80:
 746    increment = ( (int64_t) zSig1 < 0 );
 747    if ( ! roundNearestEven ) {
 748        if ( roundingMode == float_round_to_zero ) {
 749            increment = 0;
 750        }
 751        else {
 752            if ( zSign ) {
 753                increment = ( roundingMode == float_round_down ) && zSig1;
 754            }
 755            else {
 756                increment = ( roundingMode == float_round_up ) && zSig1;
 757            }
 758        }
 759    }
 760    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 761        if (    ( 0x7FFE < zExp )
 762             || (    ( zExp == 0x7FFE )
 763                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 764                  && increment
 765                )
 766           ) {
 767            roundMask = 0;
 768 overflow:
 769            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 770            if (    ( roundingMode == float_round_to_zero )
 771                 || ( zSign && ( roundingMode == float_round_up ) )
 772                 || ( ! zSign && ( roundingMode == float_round_down ) )
 773               ) {
 774                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 775            }
 776            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 777        }
 778        if ( zExp <= 0 ) {
 779            isTiny =
 780                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 781                || ( zExp < 0 )
 782                || ! increment
 783                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 784            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 785            zExp = 0;
 786            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 787            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 788            if ( roundNearestEven ) {
 789                increment = ( (int64_t) zSig1 < 0 );
 790            }
 791            else {
 792                if ( zSign ) {
 793                    increment = ( roundingMode == float_round_down ) && zSig1;
 794                }
 795                else {
 796                    increment = ( roundingMode == float_round_up ) && zSig1;
 797                }
 798            }
 799            if ( increment ) {
 800                ++zSig0;
 801                zSig0 &=
 802                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 803                if ( (int64_t) zSig0 < 0 ) zExp = 1;
 804            }
 805            return packFloatx80( zSign, zExp, zSig0 );
 806        }
 807    }
 808    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 809    if ( increment ) {
 810        ++zSig0;
 811        if ( zSig0 == 0 ) {
 812            ++zExp;
 813            zSig0 = LIT64( 0x8000000000000000 );
 814        }
 815        else {
 816            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 817        }
 818    }
 819    else {
 820        if ( zSig0 == 0 ) zExp = 0;
 821    }
 822    return packFloatx80( zSign, zExp, zSig0 );
 823
 824}
 825
 826/*----------------------------------------------------------------------------
 827| Takes an abstract floating-point value having sign `zSign', exponent
 828| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 829| and returns the proper extended double-precision floating-point value
 830| corresponding to the abstract input.  This routine is just like
 831| `roundAndPackFloatx80' except that the input significand does not have to be
 832| normalized.
 833*----------------------------------------------------------------------------*/
 834
 835static floatx80
 836 normalizeRoundAndPackFloatx80(
 837     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 838 STATUS_PARAM)
 839{
 840    int8 shiftCount;
 841
 842    if ( zSig0 == 0 ) {
 843        zSig0 = zSig1;
 844        zSig1 = 0;
 845        zExp -= 64;
 846    }
 847    shiftCount = countLeadingZeros64( zSig0 );
 848    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 849    zExp -= shiftCount;
 850    return
 851        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 852
 853}
 854
 855/*----------------------------------------------------------------------------
 856| Returns the least-significant 64 fraction bits of the quadruple-precision
 857| floating-point value `a'.
 858*----------------------------------------------------------------------------*/
 859
 860INLINE uint64_t extractFloat128Frac1( float128 a )
 861{
 862
 863    return a.low;
 864
 865}
 866
 867/*----------------------------------------------------------------------------
 868| Returns the most-significant 48 fraction bits of the quadruple-precision
 869| floating-point value `a'.
 870*----------------------------------------------------------------------------*/
 871
 872INLINE uint64_t extractFloat128Frac0( float128 a )
 873{
 874
 875    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 876
 877}
 878
 879/*----------------------------------------------------------------------------
 880| Returns the exponent bits of the quadruple-precision floating-point value
 881| `a'.
 882*----------------------------------------------------------------------------*/
 883
 884INLINE int32 extractFloat128Exp( float128 a )
 885{
 886
 887    return ( a.high>>48 ) & 0x7FFF;
 888
 889}
 890
 891/*----------------------------------------------------------------------------
 892| Returns the sign bit of the quadruple-precision floating-point value `a'.
 893*----------------------------------------------------------------------------*/
 894
 895INLINE flag extractFloat128Sign( float128 a )
 896{
 897
 898    return a.high>>63;
 899
 900}
 901
 902/*----------------------------------------------------------------------------
 903| Normalizes the subnormal quadruple-precision floating-point value
 904| represented by the denormalized significand formed by the concatenation of
 905| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 906| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 907| significand are stored at the location pointed to by `zSig0Ptr', and the
 908| least significant 64 bits of the normalized significand are stored at the
 909| location pointed to by `zSig1Ptr'.
 910*----------------------------------------------------------------------------*/
 911
 912static void
 913 normalizeFloat128Subnormal(
 914     uint64_t aSig0,
 915     uint64_t aSig1,
 916     int32 *zExpPtr,
 917     uint64_t *zSig0Ptr,
 918     uint64_t *zSig1Ptr
 919 )
 920{
 921    int8 shiftCount;
 922
 923    if ( aSig0 == 0 ) {
 924        shiftCount = countLeadingZeros64( aSig1 ) - 15;
 925        if ( shiftCount < 0 ) {
 926            *zSig0Ptr = aSig1>>( - shiftCount );
 927            *zSig1Ptr = aSig1<<( shiftCount & 63 );
 928        }
 929        else {
 930            *zSig0Ptr = aSig1<<shiftCount;
 931            *zSig1Ptr = 0;
 932        }
 933        *zExpPtr = - shiftCount - 63;
 934    }
 935    else {
 936        shiftCount = countLeadingZeros64( aSig0 ) - 15;
 937        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 938        *zExpPtr = 1 - shiftCount;
 939    }
 940
 941}
 942
 943/*----------------------------------------------------------------------------
 944| Packs the sign `zSign', the exponent `zExp', and the significand formed
 945| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 946| floating-point value, returning the result.  After being shifted into the
 947| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 948| added together to form the most significant 32 bits of the result.  This
 949| means that any integer portion of `zSig0' will be added into the exponent.
 950| Since a properly normalized significand will have an integer portion equal
 951| to 1, the `zExp' input should be 1 less than the desired result exponent
 952| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 953| significand.
 954*----------------------------------------------------------------------------*/
 955
 956INLINE float128
 957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 958{
 959    float128 z;
 960
 961    z.low = zSig1;
 962    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 963    return z;
 964
 965}
 966
 967/*----------------------------------------------------------------------------
 968| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 969| and extended significand formed by the concatenation of `zSig0', `zSig1',
 970| and `zSig2', and returns the proper quadruple-precision floating-point value
 971| corresponding to the abstract input.  Ordinarily, the abstract value is
 972| simply rounded and packed into the quadruple-precision format, with the
 973| inexact exception raised if the abstract input cannot be represented
 974| exactly.  However, if the abstract value is too large, the overflow and
 975| inexact exceptions are raised and an infinity or maximal finite value is
 976| returned.  If the abstract value is too small, the input value is rounded to
 977| a subnormal number, and the underflow and inexact exceptions are raised if
 978| the abstract input cannot be represented exactly as a subnormal quadruple-
 979| precision floating-point number.
 980|     The input significand must be normalized or smaller.  If the input
 981| significand is not normalized, `zExp' must be 0; in that case, the result
 982| returned is a subnormal number, and it must not require rounding.  In the
 983| usual case that the input significand is normalized, `zExp' must be 1 less
 984| than the ``true'' floating-point exponent.  The handling of underflow and
 985| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 986*----------------------------------------------------------------------------*/
 987
 988static float128
 989 roundAndPackFloat128(
 990     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 991{
 992    int8 roundingMode;
 993    flag roundNearestEven, increment, isTiny;
 994
 995    roundingMode = STATUS(float_rounding_mode);
 996    roundNearestEven = ( roundingMode == float_round_nearest_even );
 997    increment = ( (int64_t) zSig2 < 0 );
 998    if ( ! roundNearestEven ) {
 999        if ( roundingMode == float_round_to_zero ) {
1000            increment = 0;
1001        }
1002        else {
1003            if ( zSign ) {
1004                increment = ( roundingMode == float_round_down ) && zSig2;
1005            }
1006            else {
1007                increment = ( roundingMode == float_round_up ) && zSig2;
1008            }
1009        }
1010    }
1011    if ( 0x7FFD <= (uint32_t) zExp ) {
1012        if (    ( 0x7FFD < zExp )
1013             || (    ( zExp == 0x7FFD )
1014                  && eq128(
1015                         LIT64( 0x0001FFFFFFFFFFFF ),
1016                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                         zSig0,
1018                         zSig1
1019                     )
1020                  && increment
1021                )
1022           ) {
1023            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024            if (    ( roundingMode == float_round_to_zero )
1025                 || ( zSign && ( roundingMode == float_round_up ) )
1026                 || ( ! zSign && ( roundingMode == float_round_down ) )
1027               ) {
1028                return
1029                    packFloat128(
1030                        zSign,
1031                        0x7FFE,
1032                        LIT64( 0x0000FFFFFFFFFFFF ),
1033                        LIT64( 0xFFFFFFFFFFFFFFFF )
1034                    );
1035            }
1036            return packFloat128( zSign, 0x7FFF, 0, 0 );
1037        }
1038        if ( zExp < 0 ) {
1039            if (STATUS(flush_to_zero)) {
1040                float_raise(float_flag_output_denormal STATUS_VAR);
1041                return packFloat128(zSign, 0, 0, 0);
1042            }
1043            isTiny =
1044                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045                || ( zExp < -1 )
1046                || ! increment
1047                || lt128(
1048                       zSig0,
1049                       zSig1,
1050                       LIT64( 0x0001FFFFFFFFFFFF ),
1051                       LIT64( 0xFFFFFFFFFFFFFFFF )
1052                   );
1053            shift128ExtraRightJamming(
1054                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055            zExp = 0;
1056            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057            if ( roundNearestEven ) {
1058                increment = ( (int64_t) zSig2 < 0 );
1059            }
1060            else {
1061                if ( zSign ) {
1062                    increment = ( roundingMode == float_round_down ) && zSig2;
1063                }
1064                else {
1065                    increment = ( roundingMode == float_round_up ) && zSig2;
1066                }
1067            }
1068        }
1069    }
1070    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071    if ( increment ) {
1072        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074    }
1075    else {
1076        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077    }
1078    return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080}
1081
1082/*----------------------------------------------------------------------------
1083| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085| returns the proper quadruple-precision floating-point value corresponding
1086| to the abstract input.  This routine is just like `roundAndPackFloat128'
1087| except that the input significand has fewer bits and does not have to be
1088| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089| point exponent.
1090*----------------------------------------------------------------------------*/
1091
1092static float128
1093 normalizeRoundAndPackFloat128(
1094     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095{
1096    int8 shiftCount;
1097    uint64_t zSig2;
1098
1099    if ( zSig0 == 0 ) {
1100        zSig0 = zSig1;
1101        zSig1 = 0;
1102        zExp -= 64;
1103    }
1104    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105    if ( 0 <= shiftCount ) {
1106        zSig2 = 0;
1107        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108    }
1109    else {
1110        shift128ExtraRightJamming(
1111            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112    }
1113    zExp -= shiftCount;
1114    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116}
1117
1118/*----------------------------------------------------------------------------
1119| Returns the result of converting the 32-bit two's complement integer `a'
1120| to the single-precision floating-point format.  The conversion is performed
1121| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122*----------------------------------------------------------------------------*/
1123
1124float32 int32_to_float32( int32 a STATUS_PARAM )
1125{
1126    flag zSign;
1127
1128    if ( a == 0 ) return float32_zero;
1129    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130    zSign = ( a < 0 );
1131    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133}
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 32-bit two's complement integer `a'
1137| to the double-precision floating-point format.  The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float64 int32_to_float64( int32 a STATUS_PARAM )
1142{
1143    flag zSign;
1144    uint32 absA;
1145    int8 shiftCount;
1146    uint64_t zSig;
1147
1148    if ( a == 0 ) return float64_zero;
1149    zSign = ( a < 0 );
1150    absA = zSign ? - a : a;
1151    shiftCount = countLeadingZeros32( absA ) + 21;
1152    zSig = absA;
1153    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155}
1156
1157/*----------------------------------------------------------------------------
1158| Returns the result of converting the 32-bit two's complement integer `a'
1159| to the extended double-precision floating-point format.  The conversion
1160| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161| Arithmetic.
1162*----------------------------------------------------------------------------*/
1163
1164floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165{
1166    flag zSign;
1167    uint32 absA;
1168    int8 shiftCount;
1169    uint64_t zSig;
1170
1171    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172    zSign = ( a < 0 );
1173    absA = zSign ? - a : a;
1174    shiftCount = countLeadingZeros32( absA ) + 32;
1175    zSig = absA;
1176    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178}
1179
1180/*----------------------------------------------------------------------------
1181| Returns the result of converting the 32-bit two's complement integer `a' to
1182| the quadruple-precision floating-point format.  The conversion is performed
1183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184*----------------------------------------------------------------------------*/
1185
1186float128 int32_to_float128( int32 a STATUS_PARAM )
1187{
1188    flag zSign;
1189    uint32 absA;
1190    int8 shiftCount;
1191    uint64_t zSig0;
1192
1193    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194    zSign = ( a < 0 );
1195    absA = zSign ? - a : a;
1196    shiftCount = countLeadingZeros32( absA ) + 17;
1197    zSig0 = absA;
1198    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200}
1201
1202/*----------------------------------------------------------------------------
1203| Returns the result of converting the 64-bit two's complement integer `a'
1204| to the single-precision floating-point format.  The conversion is performed
1205| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206*----------------------------------------------------------------------------*/
1207
1208float32 int64_to_float32( int64 a STATUS_PARAM )
1209{
1210    flag zSign;
1211    uint64 absA;
1212    int8 shiftCount;
1213
1214    if ( a == 0 ) return float32_zero;
1215    zSign = ( a < 0 );
1216    absA = zSign ? - a : a;
1217    shiftCount = countLeadingZeros64( absA ) - 40;
1218    if ( 0 <= shiftCount ) {
1219        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220    }
1221    else {
1222        shiftCount += 7;
1223        if ( shiftCount < 0 ) {
1224            shift64RightJamming( absA, - shiftCount, &absA );
1225        }
1226        else {
1227            absA <<= shiftCount;
1228        }
1229        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230    }
1231
1232}
1233
1234float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235{
1236    int8 shiftCount;
1237
1238    if ( a == 0 ) return float32_zero;
1239    shiftCount = countLeadingZeros64( a ) - 40;
1240    if ( 0 <= shiftCount ) {
1241        return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1242    }
1243    else {
1244        shiftCount += 7;
1245        if ( shiftCount < 0 ) {
1246            shift64RightJamming( a, - shiftCount, &a );
1247        }
1248        else {
1249            a <<= shiftCount;
1250        }
1251        return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1252    }
1253}
1254
1255/*----------------------------------------------------------------------------
1256| Returns the result of converting the 64-bit two's complement integer `a'
1257| to the double-precision floating-point format.  The conversion is performed
1258| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259*----------------------------------------------------------------------------*/
1260
1261float64 int64_to_float64( int64 a STATUS_PARAM )
1262{
1263    flag zSign;
1264
1265    if ( a == 0 ) return float64_zero;
1266    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267        return packFloat64( 1, 0x43E, 0 );
1268    }
1269    zSign = ( a < 0 );
1270    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272}
1273
1274float64 uint64_to_float64(uint64 a STATUS_PARAM)
1275{
1276    int exp =  0x43C;
1277
1278    if (a == 0) {
1279        return float64_zero;
1280    }
1281    if ((int64_t)a < 0) {
1282        shift64RightJamming(a, 1, &a);
1283        exp += 1;
1284    }
1285    return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1286}
1287
1288/*----------------------------------------------------------------------------
1289| Returns the result of converting the 64-bit two's complement integer `a'
1290| to the extended double-precision floating-point format.  The conversion
1291| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1292| Arithmetic.
1293*----------------------------------------------------------------------------*/
1294
1295floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1296{
1297    flag zSign;
1298    uint64 absA;
1299    int8 shiftCount;
1300
1301    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1302    zSign = ( a < 0 );
1303    absA = zSign ? - a : a;
1304    shiftCount = countLeadingZeros64( absA );
1305    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1306
1307}
1308
1309/*----------------------------------------------------------------------------
1310| Returns the result of converting the 64-bit two's complement integer `a' to
1311| the quadruple-precision floating-point format.  The conversion is performed
1312| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313*----------------------------------------------------------------------------*/
1314
1315float128 int64_to_float128( int64 a STATUS_PARAM )
1316{
1317    flag zSign;
1318    uint64 absA;
1319    int8 shiftCount;
1320    int32 zExp;
1321    uint64_t zSig0, zSig1;
1322
1323    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324    zSign = ( a < 0 );
1325    absA = zSign ? - a : a;
1326    shiftCount = countLeadingZeros64( absA ) + 49;
1327    zExp = 0x406E - shiftCount;
1328    if ( 64 <= shiftCount ) {
1329        zSig1 = 0;
1330        zSig0 = absA;
1331        shiftCount -= 64;
1332    }
1333    else {
1334        zSig1 = absA;
1335        zSig0 = 0;
1336    }
1337    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338    return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340}
1341
1342float128 uint64_to_float128(uint64 a STATUS_PARAM)
1343{
1344    if (a == 0) {
1345        return float128_zero;
1346    }
1347    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1348}
1349
1350/*----------------------------------------------------------------------------
1351| Returns the result of converting the single-precision floating-point value
1352| `a' to the 32-bit two's complement integer format.  The conversion is
1353| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354| Arithmetic---which means in particular that the conversion is rounded
1355| according to the current rounding mode.  If `a' is a NaN, the largest
1356| positive integer is returned.  Otherwise, if the conversion overflows, the
1357| largest integer with the same sign as `a' is returned.
1358*----------------------------------------------------------------------------*/
1359
1360int32 float32_to_int32( float32 a STATUS_PARAM )
1361{
1362    flag aSign;
1363    int_fast16_t aExp, shiftCount;
1364    uint32_t aSig;
1365    uint64_t aSig64;
1366
1367    a = float32_squash_input_denormal(a STATUS_VAR);
1368    aSig = extractFloat32Frac( a );
1369    aExp = extractFloat32Exp( a );
1370    aSign = extractFloat32Sign( a );
1371    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1372    if ( aExp ) aSig |= 0x00800000;
1373    shiftCount = 0xAF - aExp;
1374    aSig64 = aSig;
1375    aSig64 <<= 32;
1376    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1377    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1378
1379}
1380
1381/*----------------------------------------------------------------------------
1382| Returns the result of converting the single-precision floating-point value
1383| `a' to the 32-bit two's complement integer format.  The conversion is
1384| performed according to the IEC/IEEE Standard for Binary Floating-Point
1385| Arithmetic, except that the conversion is always rounded toward zero.
1386| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1387| the conversion overflows, the largest integer with the same sign as `a' is
1388| returned.
1389*----------------------------------------------------------------------------*/
1390
1391int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1392{
1393    flag aSign;
1394    int_fast16_t aExp, shiftCount;
1395    uint32_t aSig;
1396    int32_t z;
1397    a = float32_squash_input_denormal(a STATUS_VAR);
1398
1399    aSig = extractFloat32Frac( a );
1400    aExp = extractFloat32Exp( a );
1401    aSign = extractFloat32Sign( a );
1402    shiftCount = aExp - 0x9E;
1403    if ( 0 <= shiftCount ) {
1404        if ( float32_val(a) != 0xCF000000 ) {
1405            float_raise( float_flag_invalid STATUS_VAR);
1406            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1407        }
1408        return (int32_t) 0x80000000;
1409    }
1410    else if ( aExp <= 0x7E ) {
1411        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1412        return 0;
1413    }
1414    aSig = ( aSig | 0x00800000 )<<8;
1415    z = aSig>>( - shiftCount );
1416    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1417        STATUS(float_exception_flags) |= float_flag_inexact;
1418    }
1419    if ( aSign ) z = - z;
1420    return z;
1421
1422}
1423
1424/*----------------------------------------------------------------------------
1425| Returns the result of converting the single-precision floating-point value
1426| `a' to the 16-bit two's complement integer format.  The conversion is
1427| performed according to the IEC/IEEE Standard for Binary Floating-Point
1428| Arithmetic, except that the conversion is always rounded toward zero.
1429| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1430| the conversion overflows, the largest integer with the same sign as `a' is
1431| returned.
1432*----------------------------------------------------------------------------*/
1433
1434int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1435{
1436    flag aSign;
1437    int_fast16_t aExp, shiftCount;
1438    uint32_t aSig;
1439    int32 z;
1440
1441    aSig = extractFloat32Frac( a );
1442    aExp = extractFloat32Exp( a );
1443    aSign = extractFloat32Sign( a );
1444    shiftCount = aExp - 0x8E;
1445    if ( 0 <= shiftCount ) {
1446        if ( float32_val(a) != 0xC7000000 ) {
1447            float_raise( float_flag_invalid STATUS_VAR);
1448            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1449                return 0x7FFF;
1450            }
1451        }
1452        return (int32_t) 0xffff8000;
1453    }
1454    else if ( aExp <= 0x7E ) {
1455        if ( aExp | aSig ) {
1456            STATUS(float_exception_flags) |= float_flag_inexact;
1457        }
1458        return 0;
1459    }
1460    shiftCount -= 0x10;
1461    aSig = ( aSig | 0x00800000 )<<8;
1462    z = aSig>>( - shiftCount );
1463    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1464        STATUS(float_exception_flags) |= float_flag_inexact;
1465    }
1466    if ( aSign ) {
1467        z = - z;
1468    }
1469    return z;
1470
1471}
1472
1473/*----------------------------------------------------------------------------
1474| Returns the result of converting the single-precision floating-point value
1475| `a' to the 64-bit two's complement integer format.  The conversion is
1476| performed according to the IEC/IEEE Standard for Binary Floating-Point
1477| Arithmetic---which means in particular that the conversion is rounded
1478| according to the current rounding mode.  If `a' is a NaN, the largest
1479| positive integer is returned.  Otherwise, if the conversion overflows, the
1480| largest integer with the same sign as `a' is returned.
1481*----------------------------------------------------------------------------*/
1482
1483int64 float32_to_int64( float32 a STATUS_PARAM )
1484{
1485    flag aSign;
1486    int_fast16_t aExp, shiftCount;
1487    uint32_t aSig;
1488    uint64_t aSig64, aSigExtra;
1489    a = float32_squash_input_denormal(a STATUS_VAR);
1490
1491    aSig = extractFloat32Frac( a );
1492    aExp = extractFloat32Exp( a );
1493    aSign = extractFloat32Sign( a );
1494    shiftCount = 0xBE - aExp;
1495    if ( shiftCount < 0 ) {
1496        float_raise( float_flag_invalid STATUS_VAR);
1497        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1498            return LIT64( 0x7FFFFFFFFFFFFFFF );
1499        }
1500        return (int64_t) LIT64( 0x8000000000000000 );
1501    }
1502    if ( aExp ) aSig |= 0x00800000;
1503    aSig64 = aSig;
1504    aSig64 <<= 40;
1505    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1506    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1507
1508}
1509
1510/*----------------------------------------------------------------------------
1511| Returns the result of converting the single-precision floating-point value
1512| `a' to the 64-bit two's complement integer format.  The conversion is
1513| performed according to the IEC/IEEE Standard for Binary Floating-Point
1514| Arithmetic, except that the conversion is always rounded toward zero.  If
1515| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1516| conversion overflows, the largest integer with the same sign as `a' is
1517| returned.
1518*----------------------------------------------------------------------------*/
1519
1520int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1521{
1522    flag aSign;
1523    int_fast16_t aExp, shiftCount;
1524    uint32_t aSig;
1525    uint64_t aSig64;
1526    int64 z;
1527    a = float32_squash_input_denormal(a STATUS_VAR);
1528
1529    aSig = extractFloat32Frac( a );
1530    aExp = extractFloat32Exp( a );
1531    aSign = extractFloat32Sign( a );
1532    shiftCount = aExp - 0xBE;
1533    if ( 0 <= shiftCount ) {
1534        if ( float32_val(a) != 0xDF000000 ) {
1535            float_raise( float_flag_invalid STATUS_VAR);
1536            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1537                return LIT64( 0x7FFFFFFFFFFFFFFF );
1538            }
1539        }
1540        return (int64_t) LIT64( 0x8000000000000000 );
1541    }
1542    else if ( aExp <= 0x7E ) {
1543        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1544        return 0;
1545    }
1546    aSig64 = aSig | 0x00800000;
1547    aSig64 <<= 40;
1548    z = aSig64>>( - shiftCount );
1549    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1550        STATUS(float_exception_flags) |= float_flag_inexact;
1551    }
1552    if ( aSign ) z = - z;
1553    return z;
1554
1555}
1556
1557/*----------------------------------------------------------------------------
1558| Returns the result of converting the single-precision floating-point value
1559| `a' to the double-precision floating-point format.  The conversion is
1560| performed according to the IEC/IEEE Standard for Binary Floating-Point
1561| Arithmetic.
1562*----------------------------------------------------------------------------*/
1563
1564float64 float32_to_float64( float32 a STATUS_PARAM )
1565{
1566    flag aSign;
1567    int_fast16_t aExp;
1568    uint32_t aSig;
1569    a = float32_squash_input_denormal(a STATUS_VAR);
1570
1571    aSig = extractFloat32Frac( a );
1572    aExp = extractFloat32Exp( a );
1573    aSign = extractFloat32Sign( a );
1574    if ( aExp == 0xFF ) {
1575        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1576        return packFloat64( aSign, 0x7FF, 0 );
1577    }
1578    if ( aExp == 0 ) {
1579        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1580        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581        --aExp;
1582    }
1583    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1584
1585}
1586
1587/*----------------------------------------------------------------------------
1588| Returns the result of converting the single-precision floating-point value
1589| `a' to the extended double-precision floating-point format.  The conversion
1590| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1591| Arithmetic.
1592*----------------------------------------------------------------------------*/
1593
1594floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1595{
1596    flag aSign;
1597    int_fast16_t aExp;
1598    uint32_t aSig;
1599
1600    a = float32_squash_input_denormal(a STATUS_VAR);
1601    aSig = extractFloat32Frac( a );
1602    aExp = extractFloat32Exp( a );
1603    aSign = extractFloat32Sign( a );
1604    if ( aExp == 0xFF ) {
1605        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1606        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1607    }
1608    if ( aExp == 0 ) {
1609        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1610        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1611    }
1612    aSig |= 0x00800000;
1613    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1614
1615}
1616
1617/*----------------------------------------------------------------------------
1618| Returns the result of converting the single-precision floating-point value
1619| `a' to the double-precision floating-point format.  The conversion is
1620| performed according to the IEC/IEEE Standard for Binary Floating-Point
1621| Arithmetic.
1622*----------------------------------------------------------------------------*/
1623
1624float128 float32_to_float128( float32 a STATUS_PARAM )
1625{
1626    flag aSign;
1627    int_fast16_t aExp;
1628    uint32_t aSig;
1629
1630    a = float32_squash_input_denormal(a STATUS_VAR);
1631    aSig = extractFloat32Frac( a );
1632    aExp = extractFloat32Exp( a );
1633    aSign = extractFloat32Sign( a );
1634    if ( aExp == 0xFF ) {
1635        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636        return packFloat128( aSign, 0x7FFF, 0, 0 );
1637    }
1638    if ( aExp == 0 ) {
1639        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641        --aExp;
1642    }
1643    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645}
1646
1647/*----------------------------------------------------------------------------
1648| Rounds the single-precision floating-point value `a' to an integer, and
1649| returns the result as a single-precision floating-point value.  The
1650| operation is performed according to the IEC/IEEE Standard for Binary
1651| Floating-Point Arithmetic.
1652*----------------------------------------------------------------------------*/
1653
1654float32 float32_round_to_int( float32 a STATUS_PARAM)
1655{
1656    flag aSign;
1657    int_fast16_t aExp;
1658    uint32_t lastBitMask, roundBitsMask;
1659    int8 roundingMode;
1660    uint32_t z;
1661    a = float32_squash_input_denormal(a STATUS_VAR);
1662
1663    aExp = extractFloat32Exp( a );
1664    if ( 0x96 <= aExp ) {
1665        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1666            return propagateFloat32NaN( a, a STATUS_VAR );
1667        }
1668        return a;
1669    }
1670    if ( aExp <= 0x7E ) {
1671        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1672        STATUS(float_exception_flags) |= float_flag_inexact;
1673        aSign = extractFloat32Sign( a );
1674        switch ( STATUS(float_rounding_mode) ) {
1675         case float_round_nearest_even:
1676            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1677                return packFloat32( aSign, 0x7F, 0 );
1678            }
1679            break;
1680         case float_round_down:
1681            return make_float32(aSign ? 0xBF800000 : 0);
1682         case float_round_up:
1683            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1684        }
1685        return packFloat32( aSign, 0, 0 );
1686    }
1687    lastBitMask = 1;
1688    lastBitMask <<= 0x96 - aExp;
1689    roundBitsMask = lastBitMask - 1;
1690    z = float32_val(a);
1691    roundingMode = STATUS(float_rounding_mode);
1692    if ( roundingMode == float_round_nearest_even ) {
1693        z += lastBitMask>>1;
1694        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1695    }
1696    else if ( roundingMode != float_round_to_zero ) {
1697        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1698            z += roundBitsMask;
1699        }
1700    }
1701    z &= ~ roundBitsMask;
1702    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1703    return make_float32(z);
1704
1705}
1706
1707/*----------------------------------------------------------------------------
1708| Returns the result of adding the absolute values of the single-precision
1709| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1710| before being returned.  `zSign' is ignored if the result is a NaN.
1711| The addition is performed according to the IEC/IEEE Standard for Binary
1712| Floating-Point Arithmetic.
1713*----------------------------------------------------------------------------*/
1714
1715static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1716{
1717    int_fast16_t aExp, bExp, zExp;
1718    uint32_t aSig, bSig, zSig;
1719    int_fast16_t expDiff;
1720
1721    aSig = extractFloat32Frac( a );
1722    aExp = extractFloat32Exp( a );
1723    bSig = extractFloat32Frac( b );
1724    bExp = extractFloat32Exp( b );
1725    expDiff = aExp - bExp;
1726    aSig <<= 6;
1727    bSig <<= 6;
1728    if ( 0 < expDiff ) {
1729        if ( aExp == 0xFF ) {
1730            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1731            return a;
1732        }
1733        if ( bExp == 0 ) {
1734            --expDiff;
1735        }
1736        else {
1737            bSig |= 0x20000000;
1738        }
1739        shift32RightJamming( bSig, expDiff, &bSig );
1740        zExp = aExp;
1741    }
1742    else if ( expDiff < 0 ) {
1743        if ( bExp == 0xFF ) {
1744            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745            return packFloat32( zSign, 0xFF, 0 );
1746        }
1747        if ( aExp == 0 ) {
1748            ++expDiff;
1749        }
1750        else {
1751            aSig |= 0x20000000;
1752        }
1753        shift32RightJamming( aSig, - expDiff, &aSig );
1754        zExp = bExp;
1755    }
1756    else {
1757        if ( aExp == 0xFF ) {
1758            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759            return a;
1760        }
1761        if ( aExp == 0 ) {
1762            if (STATUS(flush_to_zero)) {
1763                if (aSig | bSig) {
1764                    float_raise(float_flag_output_denormal STATUS_VAR);
1765                }
1766                return packFloat32(zSign, 0, 0);
1767            }
1768            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1769        }
1770        zSig = 0x40000000 + aSig + bSig;
1771        zExp = aExp;
1772        goto roundAndPack;
1773    }
1774    aSig |= 0x20000000;
1775    zSig = ( aSig + bSig )<<1;
1776    --zExp;
1777    if ( (int32_t) zSig < 0 ) {
1778        zSig = aSig + bSig;
1779        ++zExp;
1780    }
1781 roundAndPack:
1782    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783
1784}
1785
1786/*----------------------------------------------------------------------------
1787| Returns the result of subtracting the absolute values of the single-
1788| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1789| difference is negated before being returned.  `zSign' is ignored if the
1790| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1791| Standard for Binary Floating-Point Arithmetic.
1792*----------------------------------------------------------------------------*/
1793
1794static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1795{
1796    int_fast16_t aExp, bExp, zExp;
1797    uint32_t aSig, bSig, zSig;
1798    int_fast16_t expDiff;
1799
1800    aSig = extractFloat32Frac( a );
1801    aExp = extractFloat32Exp( a );
1802    bSig = extractFloat32Frac( b );
1803    bExp = extractFloat32Exp( b );
1804    expDiff = aExp - bExp;
1805    aSig <<= 7;
1806    bSig <<= 7;
1807    if ( 0 < expDiff ) goto aExpBigger;
1808    if ( expDiff < 0 ) goto bExpBigger;
1809    if ( aExp == 0xFF ) {
1810        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1811        float_raise( float_flag_invalid STATUS_VAR);
1812        return float32_default_nan;
1813    }
1814    if ( aExp == 0 ) {
1815        aExp = 1;
1816        bExp = 1;
1817    }
1818    if ( bSig < aSig ) goto aBigger;
1819    if ( aSig < bSig ) goto bBigger;
1820    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1821 bExpBigger:
1822    if ( bExp == 0xFF ) {
1823        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1824        return packFloat32( zSign ^ 1, 0xFF, 0 );
1825    }
1826    if ( aExp == 0 ) {
1827        ++expDiff;
1828    }
1829    else {
1830        aSig |= 0x40000000;
1831    }
1832    shift32RightJamming( aSig, - expDiff, &aSig );
1833    bSig |= 0x40000000;
1834 bBigger:
1835    zSig = bSig - aSig;
1836    zExp = bExp;
1837    zSign ^= 1;
1838    goto normalizeRoundAndPack;
1839 aExpBigger:
1840    if ( aExp == 0xFF ) {
1841        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1842        return a;
1843    }
1844    if ( bExp == 0 ) {
1845        --expDiff;
1846    }
1847    else {
1848        bSig |= 0x40000000;
1849    }
1850    shift32RightJamming( bSig, expDiff, &bSig );
1851    aSig |= 0x40000000;
1852 aBigger:
1853    zSig = aSig - bSig;
1854    zExp = aExp;
1855 normalizeRoundAndPack:
1856    --zExp;
1857    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858
1859}
1860
1861/*----------------------------------------------------------------------------
1862| Returns the result of adding the single-precision floating-point values `a'
1863| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1864| Binary Floating-Point Arithmetic.
1865*----------------------------------------------------------------------------*/
1866
1867float32 float32_add( float32 a, float32 b STATUS_PARAM )
1868{
1869    flag aSign, bSign;
1870    a = float32_squash_input_denormal(a STATUS_VAR);
1871    b = float32_squash_input_denormal(b STATUS_VAR);
1872
1873    aSign = extractFloat32Sign( a );
1874    bSign = extractFloat32Sign( b );
1875    if ( aSign == bSign ) {
1876        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1877    }
1878    else {
1879        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880    }
1881
1882}
1883
1884/*----------------------------------------------------------------------------
1885| Returns the result of subtracting the single-precision floating-point values
1886| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1887| for Binary Floating-Point Arithmetic.
1888*----------------------------------------------------------------------------*/
1889
1890float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1891{
1892    flag aSign, bSign;
1893    a = float32_squash_input_denormal(a STATUS_VAR);
1894    b = float32_squash_input_denormal(b STATUS_VAR);
1895
1896    aSign = extractFloat32Sign( a );
1897    bSign = extractFloat32Sign( b );
1898    if ( aSign == bSign ) {
1899        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1900    }
1901    else {
1902        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1903    }
1904
1905}
1906
1907/*----------------------------------------------------------------------------
1908| Returns the result of multiplying the single-precision floating-point values
1909| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1910| for Binary Floating-Point Arithmetic.
1911*----------------------------------------------------------------------------*/
1912
1913float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1914{
1915    flag aSign, bSign, zSign;
1916    int_fast16_t aExp, bExp, zExp;
1917    uint32_t aSig, bSig;
1918    uint64_t zSig64;
1919    uint32_t zSig;
1920
1921    a = float32_squash_input_denormal(a STATUS_VAR);
1922    b = float32_squash_input_denormal(b STATUS_VAR);
1923
1924    aSig = extractFloat32Frac( a );
1925    aExp = extractFloat32Exp( a );
1926    aSign = extractFloat32Sign( a );
1927    bSig = extractFloat32Frac( b );
1928    bExp = extractFloat32Exp( b );
1929    bSign = extractFloat32Sign( b );
1930    zSign = aSign ^ bSign;
1931    if ( aExp == 0xFF ) {
1932        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1933            return propagateFloat32NaN( a, b STATUS_VAR );
1934        }
1935        if ( ( bExp | bSig ) == 0 ) {
1936            float_raise( float_flag_invalid STATUS_VAR);
1937            return float32_default_nan;
1938        }
1939        return packFloat32( zSign, 0xFF, 0 );
1940    }
1941    if ( bExp == 0xFF ) {
1942        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1943        if ( ( aExp | aSig ) == 0 ) {
1944            float_raise( float_flag_invalid STATUS_VAR);
1945            return float32_default_nan;
1946        }
1947        return packFloat32( zSign, 0xFF, 0 );
1948    }
1949    if ( aExp == 0 ) {
1950        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1951        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1952    }
1953    if ( bExp == 0 ) {
1954        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1955        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1956    }
1957    zExp = aExp + bExp - 0x7F;
1958    aSig = ( aSig | 0x00800000 )<<7;
1959    bSig = ( bSig | 0x00800000 )<<8;
1960    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1961    zSig = zSig64;
1962    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1963        zSig <<= 1;
1964        --zExp;
1965    }
1966    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967
1968}
1969
1970/*----------------------------------------------------------------------------
1971| Returns the result of dividing the single-precision floating-point value `a'
1972| by the corresponding value `b'.  The operation is performed according to the
1973| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1974*----------------------------------------------------------------------------*/
1975
1976float32 float32_div( float32 a, float32 b STATUS_PARAM )
1977{
1978    flag aSign, bSign, zSign;
1979    int_fast16_t aExp, bExp, zExp;
1980    uint32_t aSig, bSig, zSig;
1981    a = float32_squash_input_denormal(a STATUS_VAR);
1982    b = float32_squash_input_denormal(b STATUS_VAR);
1983
1984    aSig = extractFloat32Frac( a );
1985    aExp = extractFloat32Exp( a );
1986    aSign = extractFloat32Sign( a );
1987    bSig = extractFloat32Frac( b );
1988    bExp = extractFloat32Exp( b );
1989    bSign = extractFloat32Sign( b );
1990    zSign = aSign ^ bSign;
1991    if ( aExp == 0xFF ) {
1992        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993        if ( bExp == 0xFF ) {
1994            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1995            float_raise( float_flag_invalid STATUS_VAR);
1996            return float32_default_nan;
1997        }
1998        return packFloat32( zSign, 0xFF, 0 );
1999    }
2000    if ( bExp == 0xFF ) {
2001        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2002        return packFloat32( zSign, 0, 0 );
2003    }
2004    if ( bExp == 0 ) {
2005        if ( bSig == 0 ) {
2006            if ( ( aExp | aSig ) == 0 ) {
2007                float_raise( float_flag_invalid STATUS_VAR);
2008                return float32_default_nan;
2009            }
2010            float_raise( float_flag_divbyzero STATUS_VAR);
2011            return packFloat32( zSign, 0xFF, 0 );
2012        }
2013        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014    }
2015    if ( aExp == 0 ) {
2016        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2017        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018    }
2019    zExp = aExp - bExp + 0x7D;
2020    aSig = ( aSig | 0x00800000 )<<7;
2021    bSig = ( bSig | 0x00800000 )<<8;
2022    if ( bSig <= ( aSig + aSig ) ) {
2023        aSig >>= 1;
2024        ++zExp;
2025    }
2026    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2027    if ( ( zSig & 0x3F ) == 0 ) {
2028        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2029    }
2030    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031
2032}
2033
2034/*----------------------------------------------------------------------------
2035| Returns the remainder of the single-precision floating-point value `a'
2036| with respect to the corresponding value `b'.  The operation is performed
2037| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2038*----------------------------------------------------------------------------*/
2039
2040float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2041{
2042    flag aSign, zSign;
2043    int_fast16_t aExp, bExp, expDiff;
2044    uint32_t aSig, bSig;
2045    uint32_t q;
2046    uint64_t aSig64, bSig64, q64;
2047    uint32_t alternateASig;
2048    int32_t sigMean;
2049    a = float32_squash_input_denormal(a STATUS_VAR);
2050    b = float32_squash_input_denormal(b STATUS_VAR);
2051
2052    aSig = extractFloat32Frac( a );
2053    aExp = extractFloat32Exp( a );
2054    aSign = extractFloat32Sign( a );
2055    bSig = extractFloat32Frac( b );
2056    bExp = extractFloat32Exp( b );
2057    if ( aExp == 0xFF ) {
2058        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2059            return propagateFloat32NaN( a, b STATUS_VAR );
2060        }
2061        float_raise( float_flag_invalid STATUS_VAR);
2062        return float32_default_nan;
2063    }
2064    if ( bExp == 0xFF ) {
2065        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2066        return a;
2067    }
2068    if ( bExp == 0 ) {
2069        if ( bSig == 0 ) {
2070            float_raise( float_flag_invalid STATUS_VAR);
2071            return float32_default_nan;
2072        }
2073        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2074    }
2075    if ( aExp == 0 ) {
2076        if ( aSig == 0 ) return a;
2077        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2078    }
2079    expDiff = aExp - bExp;
2080    aSig |= 0x00800000;
2081    bSig |= 0x00800000;
2082    if ( expDiff < 32 ) {
2083        aSig <<= 8;
2084        bSig <<= 8;
2085        if ( expDiff < 0 ) {
2086            if ( expDiff < -1 ) return a;
2087            aSig >>= 1;
2088        }
2089        q = ( bSig <= aSig );
2090        if ( q ) aSig -= bSig;
2091        if ( 0 < expDiff ) {
2092            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2093            q >>= 32 - expDiff;
2094            bSig >>= 2;
2095            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2096        }
2097        else {
2098            aSig >>= 2;
2099            bSig >>= 2;
2100        }
2101    }
2102    else {
2103        if ( bSig <= aSig ) aSig -= bSig;
2104        aSig64 = ( (uint64_t) aSig )<<40;
2105        bSig64 = ( (uint64_t) bSig )<<40;
2106        expDiff -= 64;
2107        while ( 0 < expDiff ) {
2108            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2109            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2110            aSig64 = - ( ( bSig * q64 )<<38 );
2111            expDiff -= 62;
2112        }
2113        expDiff += 64;
2114        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2115        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2116        q = q64>>( 64 - expDiff );
2117        bSig <<= 6;
2118        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2119    }
2120    do {
2121        alternateASig = aSig;
2122        ++q;
2123        aSig -= bSig;
2124    } while ( 0 <= (int32_t) aSig );
2125    sigMean = aSig + alternateASig;
2126    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2127        aSig = alternateASig;
2128    }
2129    zSign = ( (int32_t) aSig < 0 );
2130    if ( zSign ) aSig = - aSig;
2131    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132
2133}
2134
2135/*----------------------------------------------------------------------------
2136| Returns the result of multiplying the single-precision floating-point values
2137| `a' and `b' then adding 'c', with no intermediate rounding step after the
2138| multiplication.  The operation is performed according to the IEC/IEEE
2139| Standard for Binary Floating-Point Arithmetic 754-2008.
2140| The flags argument allows the caller to select negation of the
2141| addend, the intermediate product, or the final result. (The difference
2142| between this and having the caller do a separate negation is that negating
2143| externally will flip the sign bit on NaNs.)
2144*----------------------------------------------------------------------------*/
2145
2146float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2147{
2148    flag aSign, bSign, cSign, zSign;
2149    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2150    uint32_t aSig, bSig, cSig;
2151    flag pInf, pZero, pSign;
2152    uint64_t pSig64, cSig64, zSig64;
2153    uint32_t pSig;
2154    int shiftcount;
2155    flag signflip, infzero;
2156
2157    a = float32_squash_input_denormal(a STATUS_VAR);
2158    b = float32_squash_input_denormal(b STATUS_VAR);
2159    c = float32_squash_input_denormal(c STATUS_VAR);
2160    aSig = extractFloat32Frac(a);
2161    aExp = extractFloat32Exp(a);
2162    aSign = extractFloat32Sign(a);
2163    bSig = extractFloat32Frac(b);
2164    bExp = extractFloat32Exp(b);
2165    bSign = extractFloat32Sign(b);
2166    cSig = extractFloat32Frac(c);
2167    cExp = extractFloat32Exp(c);
2168    cSign = extractFloat32Sign(c);
2169
2170    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2171               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2172
2173    /* It is implementation-defined whether the cases of (0,inf,qnan)
2174     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2175     * they return if they do), so we have to hand this information
2176     * off to the target-specific pick-a-NaN routine.
2177     */
2178    if (((aExp == 0xff) && aSig) ||
2179        ((bExp == 0xff) && bSig) ||
2180        ((cExp == 0xff) && cSig)) {
2181        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2182    }
2183
2184    if (infzero) {
2185        float_raise(float_flag_invalid STATUS_VAR);
2186        return float32_default_nan;
2187    }
2188
2189    if (flags & float_muladd_negate_c) {
2190        cSign ^= 1;
2191    }
2192
2193    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2194
2195    /* Work out the sign and type of the product */
2196    pSign = aSign ^ bSign;
2197    if (flags & float_muladd_negate_product) {
2198        pSign ^= 1;
2199    }
2200    pInf = (aExp == 0xff) || (bExp == 0xff);
2201    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2202
2203    if (cExp == 0xff) {
2204        if (pInf && (pSign ^ cSign)) {
2205            /* addition of opposite-signed infinities => InvalidOperation */
2206            float_raise(float_flag_invalid STATUS_VAR);
2207            return float32_default_nan;
2208        }
2209        /* Otherwise generate an infinity of the same sign */
2210        return packFloat32(cSign ^ signflip, 0xff, 0);
2211    }
2212
2213    if (pInf) {
2214        return packFloat32(pSign ^ signflip, 0xff, 0);
2215    }
2216
2217    if (pZero) {
2218        if (cExp == 0) {
2219            if (cSig == 0) {
2220                /* Adding two exact zeroes */
2221                if (pSign == cSign) {
2222                    zSign = pSign;
2223                } else if (STATUS(float_rounding_mode) == float_round_down) {
2224                    zSign = 1;
2225                } else {
2226                    zSign = 0;
2227                }
2228                return packFloat32(zSign ^ signflip, 0, 0);
2229            }
2230            /* Exact zero plus a denorm */
2231            if (STATUS(flush_to_zero)) {
2232                float_raise(float_flag_output_denormal STATUS_VAR);
2233                return packFloat32(cSign ^ signflip, 0, 0);
2234            }
2235        }
2236        /* Zero plus something non-zero : just return the something */
2237        return packFloat32(cSign ^ signflip, cExp, cSig);
2238    }
2239
2240    if (aExp == 0) {
2241        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2242    }
2243    if (bExp == 0) {
2244        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2245    }
2246
2247    /* Calculate the actual result a * b + c */
2248
2249    /* Multiply first; this is easy. */
2250    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2251     * because we want the true exponent, not the "one-less-than"
2252     * flavour that roundAndPackFloat32() takes.
2253     */
2254    pExp = aExp + bExp - 0x7e;
2255    aSig = (aSig | 0x00800000) << 7;
2256    bSig = (bSig | 0x00800000) << 8;
2257    pSig64 = (uint64_t)aSig * bSig;
2258    if ((int64_t)(pSig64 << 1) >= 0) {
2259        pSig64 <<= 1;
2260        pExp--;
2261    }
2262
2263    zSign = pSign ^ signflip;
2264
2265    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2266     * position 62.
2267     */
2268    if (cExp == 0) {
2269        if (!cSig) {
2270            /* Throw out the special case of c being an exact zero now */
2271            shift64RightJamming(pSig64, 32, &pSig64);
2272            pSig = pSig64;
2273            return roundAndPackFloat32(zSign, pExp - 1,
2274                                       pSig STATUS_VAR);
2275        }
2276        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2277    }
2278
2279    cSig64 = (uint64_t)cSig << (62 - 23);
2280    cSig64 |= LIT64(0x4000000000000000);
2281    expDiff = pExp - cExp;
2282
2283    if (pSign == cSign) {
2284        /* Addition */
2285        if (expDiff > 0) {
2286            /* scale c to match p */
2287            shift64RightJamming(cSig64, expDiff, &cSig64);
2288            zExp = pExp;
2289        } else if (expDiff < 0) {
2290            /* scale p to match c */
2291            shift64RightJamming(pSig64, -expDiff, &pSig64);
2292            zExp = cExp;
2293        } else {
2294            /* no scaling needed */
2295            zExp = cExp;
2296        }
2297        /* Add significands and make sure explicit bit ends up in posn 62 */
2298        zSig64 = pSig64 + cSig64;
2299        if ((int64_t)zSig64 < 0) {
2300            shift64RightJamming(zSig64, 1, &zSig64);
2301        } else {
2302            zExp--;
2303        }
2304    } else {
2305        /* Subtraction */
2306        if (expDiff > 0) {
2307            shift64RightJamming(cSig64, expDiff, &cSig64);
2308            zSig64 = pSig64 - cSig64;
2309            zExp = pExp;
2310        } else if (expDiff < 0) {
2311            shift64RightJamming(pSig64, -expDiff, &pSig64);
2312            zSig64 = cSig64 - pSig64;
2313            zExp = cExp;
2314            zSign ^= 1;
2315        } else {
2316            zExp = pExp;
2317            if (cSig64 < pSig64) {
2318                zSig64 = pSig64 - cSig64;
2319            } else if (pSig64 < cSig64) {
2320                zSig64 = cSig64 - pSig64;
2321                zSign ^= 1;
2322            } else {
2323                /* Exact zero */
2324                zSign = signflip;
2325                if (STATUS(float_rounding_mode) == float_round_down) {
2326                    zSign ^= 1;
2327                }
2328                return packFloat32(zSign, 0, 0);
2329            }
2330        }
2331        --zExp;
2332        /* Normalize to put the explicit bit back into bit 62. */
2333        shiftcount = countLeadingZeros64(zSig64) - 1;
2334        zSig64 <<= shiftcount;
2335        zExp -= shiftcount;
2336    }
2337    shift64RightJamming(zSig64, 32, &zSig64);
2338    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2339}
2340
2341
2342/*----------------------------------------------------------------------------
2343| Returns the square root of the single-precision floating-point value `a'.
2344| The operation is performed according to the IEC/IEEE Standard for Binary
2345| Floating-Point Arithmetic.
2346*----------------------------------------------------------------------------*/
2347
2348float32 float32_sqrt( float32 a STATUS_PARAM )
2349{
2350    flag aSign;
2351    int_fast16_t aExp, zExp;
2352    uint32_t aSig, zSig;
2353    uint64_t rem, term;
2354    a = float32_squash_input_denormal(a STATUS_VAR);
2355
2356    aSig = extractFloat32Frac( a );
2357    aExp = extractFloat32Exp( a );
2358    aSign = extractFloat32Sign( a );
2359    if ( aExp == 0xFF ) {
2360        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2361        if ( ! aSign ) return a;
2362        float_raise( float_flag_invalid STATUS_VAR);
2363        return float32_default_nan;
2364    }
2365    if ( aSign ) {
2366        if ( ( aExp | aSig ) == 0 ) return a;
2367        float_raise( float_flag_invalid STATUS_VAR);
2368        return float32_default_nan;
2369    }
2370    if ( aExp == 0 ) {
2371        if ( aSig == 0 ) return float32_zero;
2372        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2373    }
2374    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2375    aSig = ( aSig | 0x00800000 )<<8;
2376    zSig = estimateSqrt32( aExp, aSig ) + 2;
2377    if ( ( zSig & 0x7F ) <= 5 ) {
2378        if ( zSig < 2 ) {
2379            zSig = 0x7FFFFFFF;
2380            goto roundAndPack;
2381        }
2382        aSig >>= aExp & 1;
2383        term = ( (uint64_t) zSig ) * zSig;
2384        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2385        while ( (int64_t) rem < 0 ) {
2386            --zSig;
2387            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2388        }
2389        zSig |= ( rem != 0 );
2390    }
2391    shift32RightJamming( zSig, 1, &zSig );
2392 roundAndPack:
2393    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2394
2395}
2396
2397/*----------------------------------------------------------------------------
2398| Returns the binary exponential of the single-precision floating-point value
2399| `a'. The operation is performed according to the IEC/IEEE Standard for
2400| Binary Floating-Point Arithmetic.
2401|
2402| Uses the following identities:
2403|
2404| 1. -------------------------------------------------------------------------
2405|      x    x*ln(2)
2406|     2  = e
2407|
2408| 2. -------------------------------------------------------------------------
2409|                      2     3     4     5           n
2410|      x        x     x     x     x     x           x
2411|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2412|               1!    2!    3!    4!    5!          n!
2413*----------------------------------------------------------------------------*/
2414
2415static const float64 float32_exp2_coefficients[15] =
2416{
2417    const_float64( 0x3ff0000000000000ll ), /*  1 */
2418    const_float64( 0x3fe0000000000000ll ), /*  2 */
2419    const_float64( 0x3fc5555555555555ll ), /*  3 */
2420    const_float64( 0x3fa5555555555555ll ), /*  4 */
2421    const_float64( 0x3f81111111111111ll ), /*  5 */
2422    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2423    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2424    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2425    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2426    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2427    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2428    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2429    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2430    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2431    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2432};
2433
2434float32 float32_exp2( float32 a STATUS_PARAM )
2435{
2436    flag aSign;
2437    int_fast16_t aExp;
2438    uint32_t aSig;
2439    float64 r, x, xn;
2440    int i;
2441    a = float32_squash_input_denormal(a STATUS_VAR);
2442
2443    aSig = extractFloat32Frac( a );
2444    aExp = extractFloat32Exp( a );
2445    aSign = extractFloat32Sign( a );
2446
2447    if ( aExp == 0xFF) {
2448        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2449        return (aSign) ? float32_zero : a;
2450    }
2451    if (aExp == 0) {
2452        if (aSig == 0) return float32_one;
2453    }
2454
2455    float_raise( float_flag_inexact STATUS_VAR);
2456
2457    /* ******************************* */
2458    /* using float64 for approximation */
2459    /* ******************************* */
2460    x = float32_to_float64(a STATUS_VAR);
2461    x = float64_mul(x, float64_ln2 STATUS_VAR);
2462
2463    xn = x;
2464    r = float64_one;
2465    for (i = 0 ; i < 15 ; i++) {
2466        float64 f;
2467
2468        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2469        r = float64_add(r, f STATUS_VAR);
2470
2471        xn = float64_mul(xn, x STATUS_VAR);
2472    }
2473
2474    return float64_to_float32(r, status);
2475}
2476
2477/*----------------------------------------------------------------------------
2478| Returns the binary log of the single-precision floating-point value `a'.
2479| The operation is performed according to the IEC/IEEE Standard for Binary
2480| Floating-Point Arithmetic.
2481*----------------------------------------------------------------------------*/
2482float32 float32_log2( float32 a STATUS_PARAM )
2483{
2484    flag aSign, zSign;
2485    int_fast16_t aExp;
2486    uint32_t aSig, zSig, i;
2487
2488    a = float32_squash_input_denormal(a STATUS_VAR);
2489    aSig = extractFloat32Frac( a );
2490    aExp = extractFloat32Exp( a );
2491    aSign = extractFloat32Sign( a );
2492
2493    if ( aExp == 0 ) {
2494        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2495        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2496    }
2497    if ( aSign ) {
2498        float_raise( float_flag_invalid STATUS_VAR);
2499        return float32_default_nan;
2500    }
2501    if ( aExp == 0xFF ) {
2502        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2503        return a;
2504    }
2505
2506    aExp -= 0x7F;
2507    aSig |= 0x00800000;
2508    zSign = aExp < 0;
2509    zSig = aExp << 23;
2510
2511    for (i = 1 << 22; i > 0; i >>= 1) {
2512        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2513        if ( aSig & 0x01000000 ) {
2514            aSig >>= 1;
2515            zSig |= i;
2516        }
2517    }
2518
2519    if ( zSign )
2520        zSig = -zSig;
2521
2522    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2523}
2524
2525/*----------------------------------------------------------------------------
2526| Returns 1 if the single-precision floating-point value `a' is equal to
2527| the corresponding value `b', and 0 otherwise.  The invalid exception is
2528| raised if either operand is a NaN.  Otherwise, the comparison is performed
2529| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2530*----------------------------------------------------------------------------*/
2531
2532int float32_eq( float32 a, float32 b STATUS_PARAM )
2533{
2534    uint32_t av, bv;
2535    a = float32_squash_input_denormal(a STATUS_VAR);
2536    b = float32_squash_input_denormal(b STATUS_VAR);
2537
2538    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2539         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2540       ) {
2541        float_raise( float_flag_invalid STATUS_VAR);
2542        return 0;
2543    }
2544    av = float32_val(a);
2545    bv = float32_val(b);
2546    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2547}
2548
2549/*----------------------------------------------------------------------------
2550| Returns 1 if the single-precision floating-point value `a' is less than
2551| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2552| exception is raised if either operand is a NaN.  The comparison is performed
2553| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2554*----------------------------------------------------------------------------*/
2555
2556int float32_le( float32 a, float32 b STATUS_PARAM )
2557{
2558    flag aSign, bSign;
2559    uint32_t av, bv;
2560    a = float32_squash_input_denormal(a STATUS_VAR);
2561    b = float32_squash_input_denormal(b STATUS_VAR);
2562
2563    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2564         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2565       ) {
2566        float_raise( float_flag_invalid STATUS_VAR);
2567        return 0;
2568    }
2569    aSign = extractFloat32Sign( a );
2570    bSign = extractFloat32Sign( b );
2571    av = float32_val(a);
2572    bv = float32_val(b);
2573    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2574    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2575
2576}
2577
2578/*----------------------------------------------------------------------------
2579| Returns 1 if the single-precision floating-point value `a' is less than
2580| the corresponding value `b', and 0 otherwise.  The invalid exception is
2581| raised if either operand is a NaN.  The comparison is performed according
2582| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2583*----------------------------------------------------------------------------*/
2584
2585int float32_lt( float32 a, float32 b STATUS_PARAM )
2586{
2587    flag aSign, bSign;
2588    uint32_t av, bv;
2589    a = float32_squash_input_denormal(a STATUS_VAR);
2590    b = float32_squash_input_denormal(b STATUS_VAR);
2591
2592    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2593         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2594       ) {
2595        float_raise( float_flag_invalid STATUS_VAR);
2596        return 0;
2597    }
2598    aSign = extractFloat32Sign( a );
2599    bSign = extractFloat32Sign( b );
2600    av = float32_val(a);
2601    bv = float32_val(b);
2602    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2603    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2604
2605}
2606
2607/*----------------------------------------------------------------------------
2608| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2609| be compared, and 0 otherwise.  The invalid exception is raised if either
2610| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2611| Standard for Binary Floating-Point Arithmetic.
2612*----------------------------------------------------------------------------*/
2613
2614int float32_unordered( float32 a, float32 b STATUS_PARAM )
2615{
2616    a = float32_squash_input_denormal(a STATUS_VAR);
2617    b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621       ) {
2622        float_raise( float_flag_invalid STATUS_VAR);
2623        return 1;
2624    }
2625    return 0;
2626}
2627
2628/*----------------------------------------------------------------------------
2629| Returns 1 if the single-precision floating-point value `a' is equal to
2630| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2631| exception.  The comparison is performed according to the IEC/IEEE Standard
2632| for Binary Floating-Point Arithmetic.
2633*----------------------------------------------------------------------------*/
2634
2635int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2636{
2637    a = float32_squash_input_denormal(a STATUS_VAR);
2638    b = float32_squash_input_denormal(b STATUS_VAR);
2639
2640    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2641         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2642       ) {
2643        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2644            float_raise( float_flag_invalid STATUS_VAR);
2645        }
2646        return 0;
2647    }
2648    return ( float32_val(a) == float32_val(b) ) ||
2649            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2650}
2651
2652/*----------------------------------------------------------------------------
2653| Returns 1 if the single-precision floating-point value `a' is less than or
2654| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2655| cause an exception.  Otherwise, the comparison is performed according to the
2656| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2657*----------------------------------------------------------------------------*/
2658
2659int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2660{
2661    flag aSign, bSign;
2662    uint32_t av, bv;
2663    a = float32_squash_input_denormal(a STATUS_VAR);
2664    b = float32_squash_input_denormal(b STATUS_VAR);
2665
2666    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2667         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2668       ) {
2669        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2670            float_raise( float_flag_invalid STATUS_VAR);
2671        }
2672        return 0;
2673    }
2674    aSign = extractFloat32Sign( a );
2675    bSign = extractFloat32Sign( b );
2676    av = float32_val(a);
2677    bv = float32_val(b);
2678    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2679    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2680
2681}
2682
2683/*----------------------------------------------------------------------------
2684| Returns 1 if the single-precision floating-point value `a' is less than
2685| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2686| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2687| Standard for Binary Floating-Point Arithmetic.
2688*----------------------------------------------------------------------------*/
2689
2690int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2691{
2692    flag aSign, bSign;
2693    uint32_t av, bv;
2694    a = float32_squash_input_denormal(a STATUS_VAR);
2695    b = float32_squash_input_denormal(b STATUS_VAR);
2696
2697    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2698         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2699       ) {
2700        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2701            float_raise( float_flag_invalid STATUS_VAR);
2702        }
2703        return 0;
2704    }
2705    aSign = extractFloat32Sign( a );
2706    bSign = extractFloat32Sign( b );
2707    av = float32_val(a);
2708    bv = float32_val(b);
2709    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2710    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2711
2712}
2713
2714/*----------------------------------------------------------------------------
2715| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2716| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2717| comparison is performed according to the IEC/IEEE Standard for Binary
2718| Floating-Point Arithmetic.
2719*----------------------------------------------------------------------------*/
2720
2721int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2722{
2723    a = float32_squash_input_denormal(a STATUS_VAR);
2724    b = float32_squash_input_denormal(b STATUS_VAR);
2725
2726    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2727         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2728       ) {
2729        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2730            float_raise( float_flag_invalid STATUS_VAR);
2731        }
2732        return 1;
2733    }
2734    return 0;
2735}
2736
2737/*----------------------------------------------------------------------------
2738| Returns the result of converting the double-precision floating-point value
2739| `a' to the 32-bit two's complement integer format.  The conversion is
2740| performed according to the IEC/IEEE Standard for Binary Floating-Point
2741| Arithmetic---which means in particular that the conversion is rounded
2742| according to the current rounding mode.  If `a' is a NaN, the largest
2743| positive integer is returned.  Otherwise, if the conversion overflows, the
2744| largest integer with the same sign as `a' is returned.
2745*----------------------------------------------------------------------------*/
2746
2747int32 float64_to_int32( float64 a STATUS_PARAM )
2748{
2749    flag aSign;
2750    int_fast16_t aExp, shiftCount;
2751    uint64_t aSig;
2752    a = float64_squash_input_denormal(a STATUS_VAR);
2753
2754    aSig = extractFloat64Frac( a );
2755    aExp = extractFloat64Exp( a );
2756    aSign = extractFloat64Sign( a );
2757    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2758    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2759    shiftCount = 0x42C - aExp;
2760    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2761    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2762
2763}
2764
2765/*----------------------------------------------------------------------------
2766| Returns the result of converting the double-precision floating-point value
2767| `a' to the 32-bit two's complement integer format.  The conversion is
2768| performed according to the IEC/IEEE Standard for Binary Floating-Point
2769| Arithmetic, except that the conversion is always rounded toward zero.
2770| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2771| the conversion overflows, the largest integer with the same sign as `a' is
2772| returned.
2773*----------------------------------------------------------------------------*/
2774
2775int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2776{
2777    flag aSign;
2778    int_fast16_t aExp, shiftCount;
2779    uint64_t aSig, savedASig;
2780    int32_t z;
2781    a = float64_squash_input_denormal(a STATUS_VAR);
2782
2783    aSig = extractFloat64Frac( a );
2784    aExp = extractFloat64Exp( a );
2785    aSign = extractFloat64Sign( a );
2786    if ( 0x41E < aExp ) {
2787        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2788        goto invalid;
2789    }
2790    else if ( aExp < 0x3FF ) {
2791        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2792        return 0;
2793    }
2794    aSig |= LIT64( 0x0010000000000000 );
2795    shiftCount = 0x433 - aExp;
2796    savedASig = aSig;
2797    aSig >>= shiftCount;
2798    z = aSig;
2799    if ( aSign ) z = - z;
2800    if ( ( z < 0 ) ^ aSign ) {
2801 invalid:
2802        float_raise( float_flag_invalid STATUS_VAR);
2803        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2804    }
2805    if ( ( aSig<<shiftCount ) != savedASig ) {
2806        STATUS(float_exception_flags) |= float_flag_inexact;
2807    }
2808    return z;
2809
2810}
2811
2812/*----------------------------------------------------------------------------
2813| Returns the result of converting the double-precision floating-point value
2814| `a' to the 16-bit two's complement integer format.  The conversion is
2815| performed according to the IEC/IEEE Standard for Binary Floating-Point
2816| Arithmetic, except that the conversion is always rounded toward zero.
2817| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2818| the conversion overflows, the largest integer with the same sign as `a' is
2819| returned.
2820*----------------------------------------------------------------------------*/
2821
2822int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2823{
2824    flag aSign;
2825    int_fast16_t aExp, shiftCount;
2826    uint64_t aSig, savedASig;
2827    int32 z;
2828
2829    aSig = extractFloat64Frac( a );
2830    aExp = extractFloat64Exp( a );
2831    aSign = extractFloat64Sign( a );
2832    if ( 0x40E < aExp ) {
2833        if ( ( aExp == 0x7FF ) && aSig ) {
2834            aSign = 0;
2835        }
2836        goto invalid;
2837    }
2838    else if ( aExp < 0x3FF ) {
2839        if ( aExp || aSig ) {
2840            STATUS(float_exception_flags) |= float_flag_inexact;
2841        }
2842        return 0;
2843    }
2844    aSig |= LIT64( 0x0010000000000000 );
2845    shiftCount = 0x433 - aExp;
2846    savedASig = aSig;
2847    aSig >>= shiftCount;
2848    z = aSig;
2849    if ( aSign ) {
2850        z = - z;
2851    }
2852    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2853 invalid:
2854        float_raise( float_flag_invalid STATUS_VAR);
2855        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2856    }
2857    if ( ( aSig<<shiftCount ) != savedASig ) {
2858        STATUS(float_exception_flags) |= float_flag_inexact;
2859    }
2860    return z;
2861}
2862
2863/*----------------------------------------------------------------------------
2864| Returns the result of converting the double-precision floating-point value
2865| `a' to the 64-bit two's complement integer format.  The conversion is
2866| performed according to the IEC/IEEE Standard for Binary Floating-Point
2867| Arithmetic---which means in particular that the conversion is rounded
2868| according to the current rounding mode.  If `a' is a NaN, the largest
2869| positive integer is returned.  Otherwise, if the conversion overflows, the
2870| largest integer with the same sign as `a' is returned.
2871*----------------------------------------------------------------------------*/
2872
2873int64 float64_to_int64( float64 a STATUS_PARAM )
2874{
2875    flag aSign;
2876    int_fast16_t aExp, shiftCount;
2877    uint64_t aSig, aSigExtra;
2878    a = float64_squash_input_denormal(a STATUS_VAR);
2879
2880    aSig = extractFloat64Frac( a );
2881    aExp = extractFloat64Exp( a );
2882    aSign = extractFloat64Sign( a );
2883    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2884    shiftCount = 0x433 - aExp;
2885    if ( shiftCount <= 0 ) {
2886        if ( 0x43E < aExp ) {
2887            float_raise( float_flag_invalid STATUS_VAR);
2888            if (    ! aSign
2889                 || (    ( aExp == 0x7FF )
2890                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2891               ) {
2892                return LIT64( 0x7FFFFFFFFFFFFFFF );
2893            }
2894            return (int64_t) LIT64( 0x8000000000000000 );
2895        }
2896        aSigExtra = 0;
2897        aSig <<= - shiftCount;
2898    }
2899    else {
2900        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2901    }
2902    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2903
2904}
2905
2906/*----------------------------------------------------------------------------
2907| Returns the result of converting the double-precision floating-point value
2908| `a' to the 64-bit two's complement integer format.  The conversion is
2909| performed according to the IEC/IEEE Standard for Binary Floating-Point
2910| Arithmetic, except that the conversion is always rounded toward zero.
2911| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2912| the conversion overflows, the largest integer with the same sign as `a' is
2913| returned.
2914*----------------------------------------------------------------------------*/
2915
2916int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2917{
2918    flag aSign;
2919    int_fast16_t aExp, shiftCount;
2920    uint64_t aSig;
2921    int64 z;
2922    a = float64_squash_input_denormal(a STATUS_VAR);
2923
2924    aSig = extractFloat64Frac( a );
2925    aExp = extractFloat64Exp( a );
2926    aSign = extractFloat64Sign( a );
2927    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2928    shiftCount = aExp - 0x433;
2929    if ( 0 <= shiftCount ) {
2930        if ( 0x43E <= aExp ) {
2931            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2932                float_raise( float_flag_invalid STATUS_VAR);
2933                if (    ! aSign
2934                     || (    ( aExp == 0x7FF )
2935                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2936                   ) {
2937                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2938                }
2939            }
2940            return (int64_t) LIT64( 0x8000000000000000 );
2941        }
2942        z = aSig<<shiftCount;
2943    }
2944    else {
2945        if ( aExp < 0x3FE ) {
2946            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2947            return 0;
2948        }
2949        z = aSig>>( - shiftCount );
2950        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2951            STATUS(float_exception_flags) |= float_flag_inexact;
2952        }
2953    }
2954    if ( aSign ) z = - z;
2955    return z;
2956
2957}
2958
2959/*----------------------------------------------------------------------------
2960| Returns the result of converting the double-precision floating-point value
2961| `a' to the single-precision floating-point format.  The conversion is
2962| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963| Arithmetic.
2964*----------------------------------------------------------------------------*/
2965
2966float32 float64_to_float32( float64 a STATUS_PARAM )
2967{
2968    flag aSign;
2969    int_fast16_t aExp;
2970    uint64_t aSig;
2971    uint32_t zSig;
2972    a = float64_squash_input_denormal(a STATUS_VAR);
2973
2974    aSig = extractFloat64Frac( a );
2975    aExp = extractFloat64Exp( a );
2976    aSign = extractFloat64Sign( a );
2977    if ( aExp == 0x7FF ) {
2978        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2979        return packFloat32( aSign, 0xFF, 0 );
2980    }
2981    shift64RightJamming( aSig, 22, &aSig );
2982    zSig = aSig;
2983    if ( aExp || zSig ) {
2984        zSig |= 0x40000000;
2985        aExp -= 0x381;
2986    }
2987    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2988
2989}
2990
2991
2992/*----------------------------------------------------------------------------
2993| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2994| half-precision floating-point value, returning the result.  After being
2995| shifted into the proper positions, the three fields are simply added
2996| together to form the result.  This means that any integer portion of `zSig'
2997| will be added into the exponent.  Since a properly normalized significand
2998| will have an integer portion equal to 1, the `zExp' input should be 1 less
2999| than the desired result exponent whenever `zSig' is a complete, normalized
3000| significand.
3001*----------------------------------------------------------------------------*/
3002static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3003{
3004    return make_float16(
3005        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3006}
3007
3008/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3009   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3010
3011float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3012{
3013    flag aSign;
3014    int_fast16_t aExp;
3015    uint32_t aSig;
3016
3017    aSign = extractFloat16Sign(a);
3018    aExp = extractFloat16Exp(a);
3019    aSig = extractFloat16Frac(a);
3020
3021    if (aExp == 0x1f && ieee) {
3022        if (aSig) {
3023            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3024        }
3025        return packFloat32(aSign, 0xff, 0);
3026    }
3027    if (aExp == 0) {
3028        int8 shiftCount;
3029
3030        if (aSig == 0) {
3031            return packFloat32(aSign, 0, 0);
3032        }
3033
3034        shiftCount = countLeadingZeros32( aSig ) - 21;
3035        aSig = aSig << shiftCount;
3036        aExp = -shiftCount;
3037    }
3038    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3039}
3040
3041float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3042{
3043    flag aSign;
3044    int_fast16_t aExp;
3045    uint32_t aSig;
3046    uint32_t mask;
3047    uint32_t increment;
3048    int8 roundingMode;
3049    a = float32_squash_input_denormal(a STATUS_VAR);
3050
3051    aSig = extractFloat32Frac( a );
3052    aExp = extractFloat32Exp( a );
3053    aSign = extractFloat32Sign( a );
3054    if ( aExp == 0xFF ) {
3055        if (aSig) {
3056            /* Input is a NaN */
3057            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3058            if (!ieee) {
3059                return packFloat16(aSign, 0, 0);
3060            }
3061            return r;
3062        }
3063        /* Infinity */
3064        if (!ieee) {
3065            float_raise(float_flag_invalid STATUS_VAR);
3066            return packFloat16(aSign, 0x1f, 0x3ff);
3067        }
3068        return packFloat16(aSign, 0x1f, 0);
3069    }
3070    if (aExp == 0 && aSig == 0) {
3071        return packFloat16(aSign, 0, 0);
3072    }
3073    /* Decimal point between bits 22 and 23.  */
3074    aSig |= 0x00800000;
3075    aExp -= 0x7f;
3076    if (aExp < -14) {
3077        mask = 0x00ffffff;
3078        if (aExp >= -24) {
3079            mask >>= 25 + aExp;
3080        }
3081    } else {
3082        mask = 0x00001fff;
3083    }
3084    if (aSig & mask) {
3085        float_raise( float_flag_underflow STATUS_VAR );
3086        roundingMode = STATUS(float_rounding_mode);
3087        switch (roundingMode) {
3088        case float_round_nearest_even:
3089            increment = (mask + 1) >> 1;
3090            if ((aSig & mask) == increment) {
3091                increment = aSig & (increment << 1);
3092            }
3093            break;
3094        case float_round_up:
3095            increment = aSign ? 0 : mask;
3096            break;
3097        case float_round_down:
3098            increment = aSign ? mask : 0;
3099            break;
3100        default: /* round_to_zero */
3101            increment = 0;
3102            break;
3103        }
3104        aSig += increment;
3105        if (aSig >= 0x01000000) {
3106            aSig >>= 1;
3107            aExp++;
3108        }
3109    } else if (aExp < -14
3110          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3111        float_raise( float_flag_underflow STATUS_VAR);
3112    }
3113
3114    if (ieee) {
3115        if (aExp > 15) {
3116            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3117            return packFloat16(aSign, 0x1f, 0);
3118        }
3119    } else {
3120        if (aExp > 16) {
3121            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3122            return packFloat16(aSign, 0x1f, 0x3ff);
3123        }
3124    }
3125    if (aExp < -24) {
3126        return packFloat16(aSign, 0, 0);
3127    }
3128    if (aExp < -14) {
3129        aSig >>= -14 - aExp;
3130        aExp = -14;
3131    }
3132    return packFloat16(aSign, aExp + 14, aSig >> 13);
3133}
3134
3135/*----------------------------------------------------------------------------
3136| Returns the result of converting the double-precision floating-point value
3137| `a' to the extended double-precision floating-point format.  The conversion
3138| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3139| Arithmetic.
3140*----------------------------------------------------------------------------*/
3141
3142floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3143{
3144    flag aSign;
3145    int_fast16_t aExp;
3146    uint64_t aSig;
3147
3148    a = float64_squash_input_denormal(a STATUS_VAR);
3149    aSig = extractFloat64Frac( a );
3150    aExp = extractFloat64Exp( a );
3151    aSign = extractFloat64Sign( a );
3152    if ( aExp == 0x7FF ) {
3153        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3154        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3155    }
3156    if ( aExp == 0 ) {
3157        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3158        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3159    }
3160    return
3161        packFloatx80(
3162            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3163
3164}
3165
3166/*----------------------------------------------------------------------------
3167| Returns the result of converting the double-precision floating-point value
3168| `a' to the quadruple-precision floating-point format.  The conversion is
3169| performed according to the IEC/IEEE Standard for Binary Floating-Point
3170| Arithmetic.
3171*----------------------------------------------------------------------------*/
3172
3173float128 float64_to_float128( float64 a STATUS_PARAM )
3174{
3175    flag aSign;
3176    int_fast16_t aExp;
3177    uint64_t aSig, zSig0, zSig1;
3178
3179    a = float64_squash_input_denormal(a STATUS_VAR);
3180    aSig = extractFloat64Frac( a );
3181    aExp = extractFloat64Exp( a );
3182    aSign = extractFloat64Sign( a );
3183    if ( aExp == 0x7FF ) {
3184        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3185        return packFloat128( aSign, 0x7FFF, 0, 0 );
3186    }
3187    if ( aExp == 0 ) {
3188        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3189        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3190        --aExp;
3191    }
3192    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3193    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3194
3195}
3196
3197/*----------------------------------------------------------------------------
3198| Rounds the double-precision floating-point value `a' to an integer, and
3199| returns the result as a double-precision floating-point value.  The
3200| operation is performed according to the IEC/IEEE Standard for Binary
3201| Floating-Point Arithmetic.
3202*----------------------------------------------------------------------------*/
3203
3204float64 float64_round_to_int( float64 a STATUS_PARAM )
3205{
3206    flag aSign;
3207    int_fast16_t aExp;
3208    uint64_t lastBitMask, roundBitsMask;
3209    int8 roundingMode;
3210    uint64_t z;
3211    a = float64_squash_input_denormal(a STATUS_VAR);
3212
3213    aExp = extractFloat64Exp( a );
3214    if ( 0x433 <= aExp ) {
3215        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3216            return propagateFloat64NaN( a, a STATUS_VAR );
3217        }
3218        return a;
3219    }
3220    if ( aExp < 0x3FF ) {
3221        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3222        STATUS(float_exception_flags) |= float_flag_inexact;
3223        aSign = extractFloat64Sign( a );
3224        switch ( STATUS(float_rounding_mode) ) {
3225         case float_round_nearest_even:
3226            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3227                return packFloat64( aSign, 0x3FF, 0 );
3228            }
3229            break;
3230         case float_round_down:
3231            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3232         case float_round_up:
3233            return make_float64(
3234            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3235        }
3236        return packFloat64( aSign, 0, 0 );
3237    }
3238    lastBitMask = 1;
3239    lastBitMask <<= 0x433 - aExp;
3240    roundBitsMask = lastBitMask - 1;
3241    z = float64_val(a);
3242    roundingMode = STATUS(float_rounding_mode);
3243    if ( roundingMode == float_round_nearest_even ) {
3244        z += lastBitMask>>1;
3245        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3246    }
3247    else if ( roundingMode != float_round_to_zero ) {
3248        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3249            z += roundBitsMask;
3250        }
3251    }
3252    z &= ~ roundBitsMask;
3253    if ( z != float64_val(a) )
3254        STATUS(float_exception_flags) |= float_flag_inexact;
3255    return make_float64(z);
3256
3257}
3258
3259float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3260{
3261    int oldmode;
3262    float64 res;
3263    oldmode = STATUS(float_rounding_mode);
3264    STATUS(float_rounding_mode) = float_round_to_zero;
3265    res = float64_round_to_int(a STATUS_VAR);
3266    STATUS(float_rounding_mode) = oldmode;
3267    return res;
3268}
3269
3270/*----------------------------------------------------------------------------
3271| Returns the result of adding the absolute values of the double-precision
3272| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3273| before being returned.  `zSign' is ignored if the result is a NaN.
3274| The addition is performed according to the IEC/IEEE Standard for Binary
3275| Floating-Point Arithmetic.
3276*----------------------------------------------------------------------------*/
3277
3278static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3279{
3280    int_fast16_t aExp, bExp, zExp;
3281    uint64_t aSig, bSig, zSig;
3282    int_fast16_t expDiff;
3283
3284    aSig = extractFloat64Frac( a );
3285    aExp = extractFloat64Exp( a );
3286    bSig = extractFloat64Frac( b );
3287    bExp = extractFloat64Exp( b );
3288    expDiff = aExp - bExp;
3289    aSig <<= 9;
3290    bSig <<= 9;
3291    if ( 0 < expDiff ) {
3292        if ( aExp == 0x7FF ) {
3293            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3294            return a;
3295        }
3296        if ( bExp == 0 ) {
3297            --expDiff;
3298        }
3299        else {
3300            bSig |= LIT64( 0x2000000000000000 );
3301        }
3302        shift64RightJamming( bSig, expDiff, &bSig );
3303        zExp = aExp;
3304    }
3305    else if ( expDiff < 0 ) {
3306        if ( bExp == 0x7FF ) {
3307            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3308            return packFloat64( zSign, 0x7FF, 0 );
3309        }
3310        if ( aExp == 0 ) {
3311            ++expDiff;
3312        }
3313        else {
3314            aSig |= LIT64( 0x2000000000000000 );
3315        }
3316        shift64RightJamming( aSig, - expDiff, &aSig );
3317        zExp = bExp;
3318    }
3319    else {
3320        if ( aExp == 0x7FF ) {
3321            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3322            return a;
3323        }
3324        if ( aExp == 0 ) {
3325            if (STATUS(flush_to_zero)) {
3326                if (aSig | bSig) {
3327                    float_raise(float_flag_output_denormal STATUS_VAR);
3328                }
3329                return packFloat64(zSign, 0, 0);
3330            }
3331            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3332        }
3333        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3334        zExp = aExp;
3335        goto roundAndPack;
3336    }
3337    aSig |= LIT64( 0x2000000000000000 );
3338    zSig = ( aSig + bSig )<<1;
3339    --zExp;
3340    if ( (int64_t) zSig < 0 ) {
3341        zSig = aSig + bSig;
3342        ++zExp;
3343    }
3344 roundAndPack:
3345    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3346
3347}
3348
3349/*----------------------------------------------------------------------------
3350| Returns the result of subtracting the absolute values of the double-
3351| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3352| difference is negated before being returned.  `zSign' is ignored if the
3353| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3354| Standard for Binary Floating-Point Arithmetic.
3355*----------------------------------------------------------------------------*/
3356
3357static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3358{
3359    int_fast16_t aExp, bExp, zExp;
3360    uint64_t aSig, bSig, zSig;
3361    int_fast16_t expDiff;
3362
3363    aSig = extractFloat64Frac( a );
3364    aExp = extractFloat64Exp( a );
3365    bSig = extractFloat64Frac( b );
3366    bExp = extractFloat64Exp( b );
3367    expDiff = aExp - bExp;
3368    aSig <<= 10;
3369    bSig <<= 10;
3370    if ( 0 < expDiff ) goto aExpBigger;
3371    if ( expDiff < 0 ) goto bExpBigger;
3372    if ( aExp == 0x7FF ) {
3373        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3374        float_raise( float_flag_invalid STATUS_VAR);
3375        return float64_default_nan;
3376    }
3377    if ( aExp == 0 ) {
3378        aExp = 1;
3379        bExp = 1;
3380    }
3381    if ( bSig < aSig ) goto aBigger;
3382    if ( aSig < bSig ) goto bBigger;
3383    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3384 bExpBigger:
3385    if ( bExp == 0x7FF ) {
3386        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3387        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3388    }
3389    if ( aExp == 0 ) {
3390        ++expDiff;
3391    }
3392    else {
3393        aSig |= LIT64( 0x4000000000000000 );
3394    }
3395    shift64RightJamming( aSig, - expDiff, &aSig );
3396    bSig |= LIT64( 0x4000000000000000 );
3397 bBigger:
3398    zSig = bSig - aSig;
3399    zExp = bExp;
3400    zSign ^= 1;
3401    goto normalizeRoundAndPack;
3402 aExpBigger:
3403    if ( aExp == 0x7FF ) {
3404        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3405        return a;
3406    }
3407    if ( bExp == 0 ) {
3408        --expDiff;
3409    }
3410    else {
3411        bSig |= LIT64( 0x4000000000000000 );
3412    }
3413    shift64RightJamming( bSig, expDiff, &bSig );
3414    aSig |= LIT64( 0x4000000000000000 );
3415 aBigger:
3416    zSig = aSig - bSig;
3417    zExp = aExp;
3418 normalizeRoundAndPack:
3419    --zExp;
3420    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3421
3422}
3423
3424/*----------------------------------------------------------------------------
3425| Returns the result of adding the double-precision floating-point values `a'
3426| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3427| Binary Floating-Point Arithmetic.
3428*----------------------------------------------------------------------------*/
3429
3430float64 float64_add( float64 a, float64 b STATUS_PARAM )
3431{
3432    flag aSign, bSign;
3433    a = float64_squash_input_denormal(a STATUS_VAR);
3434    b = float64_squash_input_denormal(b STATUS_VAR);
3435
3436    aSign = extractFloat64Sign( a );
3437    bSign = extractFloat64Sign( b );
3438    if ( aSign == bSign ) {
3439        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3440    }
3441    else {
3442        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3443    }
3444
3445}
3446
3447/*----------------------------------------------------------------------------
3448| Returns the result of subtracting the double-precision floating-point values
3449| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3450| for Binary Floating-Point Arithmetic.
3451*----------------------------------------------------------------------------*/
3452
3453float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3454{
3455    flag aSign, bSign;
3456    a = float64_squash_input_denormal(a STATUS_VAR);
3457    b = float64_squash_input_denormal(b STATUS_VAR);
3458
3459    aSign = extractFloat64Sign( a );
3460    bSign = extractFloat64Sign( b );
3461    if ( aSign == bSign ) {
3462        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3463    }
3464    else {
3465        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3466    }
3467
3468}
3469
3470/*----------------------------------------------------------------------------
3471| Returns the result of multiplying the double-precision floating-point values
3472| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3473| for Binary Floating-Point Arithmetic.
3474*----------------------------------------------------------------------------*/
3475
3476float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3477{
3478    flag aSign, bSign, zSign;
3479    int_fast16_t aExp, bExp, zExp;
3480    uint64_t aSig, bSig, zSig0, zSig1;
3481
3482    a = float64_squash_input_denormal(a STATUS_VAR);
3483    b = float64_squash_input_denormal(b STATUS_VAR);
3484
3485    aSig = extractFloat64Frac( a );
3486    aExp = extractFloat64Exp( a );
3487    aSign = extractFloat64Sign( a );
3488    bSig = extractFloat64Frac( b );
3489    bExp = extractFloat64Exp( b );
3490    bSign = extractFloat64Sign( b );
3491    zSign = aSign ^ bSign;
3492    if ( aExp == 0x7FF ) {
3493        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3494            return propagateFloat64NaN( a, b STATUS_VAR );
3495        }
3496        if ( ( bExp | bSig ) == 0 ) {
3497            float_raise( float_flag_invalid STATUS_VAR);
3498            return float64_default_nan;
3499        }
3500        return packFloat64( zSign, 0x7FF, 0 );
3501    }
3502    if ( bExp == 0x7FF ) {
3503        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3504        if ( ( aExp | aSig ) == 0 ) {
3505            float_raise( float_flag_invalid STATUS_VAR);
3506            return float64_default_nan;
3507        }
3508        return packFloat64( zSign, 0x7FF, 0 );
3509    }
3510    if ( aExp == 0 ) {
3511        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3512        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3513    }
3514    if ( bExp == 0 ) {
3515        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3516        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3517    }
3518    zExp = aExp + bExp - 0x3FF;
3519    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3520    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3521    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3522    zSig0 |= ( zSig1 != 0 );
3523    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3524        zSig0 <<= 1;
3525        --zExp;
3526    }
3527    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3528
3529}
3530
3531/*----------------------------------------------------------------------------
3532| Returns the result of dividing the double-precision floating-point value `a'
3533| by the corresponding value `b'.  The operation is performed according to
3534| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3535*----------------------------------------------------------------------------*/
3536
3537float64 float64_div( float64 a, float64 b STATUS_PARAM )
3538{
3539    flag aSign, bSign, zSign;
3540    int_fast16_t aExp, bExp, zExp;
3541    uint64_t aSig, bSig, zSig;
3542    uint64_t rem0, rem1;
3543    uint64_t term0, term1;
3544    a = float64_squash_input_denormal(a STATUS_VAR);
3545    b = float64_squash_input_denormal(b STATUS_VAR);
3546
3547    aSig = extractFloat64Frac( a );
3548    aExp = extractFloat64Exp( a );
3549    aSign = extractFloat64Sign( a );
3550    bSig = extractFloat64Frac( b );
3551    bExp = extractFloat64Exp( b );
3552    bSign = extractFloat64Sign( b );
3553    zSign = aSign ^ bSign;
3554    if ( aExp == 0x7FF ) {
3555        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3556        if ( bExp == 0x7FF ) {
3557            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3558            float_raise( float_flag_invalid STATUS_VAR);
3559            return float64_default_nan;
3560        }
3561        return packFloat64( zSign, 0x7FF, 0 );
3562    }
3563    if ( bExp == 0x7FF ) {
3564        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3565        return packFloat64( zSign, 0, 0 );
3566    }
3567    if ( bExp == 0 ) {
3568        if ( bSig == 0 ) {
3569            if ( ( aExp | aSig ) == 0 ) {
3570                float_raise( float_flag_invalid STATUS_VAR);
3571                return float64_default_nan;
3572            }
3573            float_raise( float_flag_divbyzero STATUS_VAR);
3574            return packFloat64( zSign, 0x7FF, 0 );
3575        }
3576        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3577    }
3578    if ( aExp == 0 ) {
3579        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3580        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3581    }
3582    zExp = aExp - bExp + 0x3FD;
3583    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3584    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3585    if ( bSig <= ( aSig + aSig ) ) {
3586        aSig >>= 1;
3587        ++zExp;
3588    }
3589    zSig = estimateDiv128To64( aSig, 0, bSig );
3590    if ( ( zSig & 0x1FF ) <= 2 ) {
3591        mul64To128( bSig, zSig, &term0, &term1 );
3592        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3593        while ( (int64_t) rem0 < 0 ) {
3594            --zSig;
3595            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3596        }
3597        zSig |= ( rem1 != 0 );
3598    }
3599    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3600
3601}
3602
3603/*----------------------------------------------------------------------------
3604| Returns the remainder of the double-precision floating-point value `a'
3605| with respect to the corresponding value `b'.  The operation is performed
3606| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3607*----------------------------------------------------------------------------*/
3608
3609float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3610{
3611    flag aSign, zSign;
3612    int_fast16_t aExp, bExp, expDiff;
3613    uint64_t aSig, bSig;
3614    uint64_t q, alternateASig;
3615    int64_t sigMean;
3616
3617    a = float64_squash_input_denormal(a STATUS_VAR);
3618    b = float64_squash_input_denormal(b STATUS_VAR);
3619    aSig = extractFloat64Frac( a );
3620    aExp = extractFloat64Exp( a );
3621    aSign = extractFloat64Sign( a );
3622    bSig = extractFloat64Frac( b );
3623    bExp = extractFloat64Exp( b );
3624    if ( aExp == 0x7FF ) {
3625        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3626            return propagateFloat64NaN( a, b STATUS_VAR );
3627        }
3628        float_raise( float_flag_invalid STATUS_VAR);
3629        return float64_default_nan;
3630    }
3631    if ( bExp == 0x7FF ) {
3632        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633        return a;
3634    }
3635    if ( bExp == 0 ) {
3636        if ( bSig == 0 ) {
3637            float_raise( float_flag_invalid STATUS_VAR);
3638            return float64_default_nan;
3639        }
3640        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3641    }
3642    if ( aExp == 0 ) {
3643        if ( aSig == 0 ) return a;
3644        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3645    }
3646    expDiff = aExp - bExp;
3647    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3648    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3649    if ( expDiff < 0 ) {
3650        if ( expDiff < -1 ) return a;
3651        aSig >>= 1;
3652    }
3653    q = ( bSig <= aSig );
3654    if ( q ) aSig -= bSig;
3655    expDiff -= 64;
3656    while ( 0 < expDiff ) {
3657        q = estimateDiv128To64( aSig, 0, bSig );
3658        q = ( 2 < q ) ? q - 2 : 0;
3659        aSig = - ( ( bSig>>2 ) * q );
3660        expDiff -= 62;
3661    }
3662    expDiff += 64;
3663    if ( 0 < expDiff ) {
3664        q = estimateDiv128To64( aSig, 0, bSig );
3665        q = ( 2 < q ) ? q - 2 : 0;
3666        q >>= 64 - expDiff;
3667        bSig >>= 2;
3668        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3669    }
3670    else {
3671        aSig >>= 2;
3672        bSig >>= 2;
3673    }
3674    do {
3675        alternateASig = aSig;
3676        ++q;
3677        aSig -= bSig;
3678    } while ( 0 <= (int64_t) aSig );
3679    sigMean = aSig + alternateASig;
3680    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3681        aSig = alternateASig;
3682    }
3683    zSign = ( (int64_t) aSig < 0 );
3684    if ( zSign ) aSig = - aSig;
3685    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3686
3687}
3688
3689/*----------------------------------------------------------------------------
3690| Returns the result of multiplying the double-precision floating-point values
3691| `a' and `b' then adding 'c', with no intermediate rounding step after the
3692| multiplication.  The operation is performed according to the IEC/IEEE
3693| Standard for Binary Floating-Point Arithmetic 754-2008.
3694| The flags argument allows the caller to select negation of the
3695| addend, the intermediate product, or the final result. (The difference
3696| between this and having the caller do a separate negation is that negating
3697| externally will flip the sign bit on NaNs.)
3698*----------------------------------------------------------------------------*/
3699
3700float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3701{
3702    flag aSign, bSign, cSign, zSign;
3703    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3704    uint64_t aSig, bSig, cSig;
3705    flag pInf, pZero, pSign;
3706    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3707    int shiftcount;
3708    flag signflip, infzero;
3709
3710    a = float64_squash_input_denormal(a STATUS_VAR);
3711    b = float64_squash_input_denormal(b STATUS_VAR);
3712    c = float64_squash_input_denormal(c STATUS_VAR);
3713    aSig = extractFloat64Frac(a);
3714    aExp = extractFloat64Exp(a);
3715    aSign = extractFloat64Sign(a);
3716    bSig = extractFloat64Frac(b);
3717    bExp = extractFloat64Exp(b);
3718    bSign = extractFloat64Sign(b);
3719    cSig = extractFloat64Frac(c);
3720    cExp = extractFloat64Exp(c);
3721    cSign = extractFloat64Sign(c);
3722
3723    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3724               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3725
3726    /* It is implementation-defined whether the cases of (0,inf,qnan)
3727     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3728     * they return if they do), so we have to hand this information
3729     * off to the target-specific pick-a-NaN routine.
3730     */
3731    if (((aExp == 0x7ff) && aSig) ||
3732        ((bExp == 0x7ff) && bSig) ||
3733        ((cExp == 0x7ff) && cSig)) {
3734        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3735    }
3736
3737    if (infzero) {
3738        float_raise(float_flag_invalid STATUS_VAR);
3739        return float64_default_nan;
3740    }
3741
3742    if (flags & float_muladd_negate_c) {
3743        cSign ^= 1;
3744    }
3745
3746    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3747
3748    /* Work out the sign and type of the product */
3749    pSign = aSign ^ bSign;
3750    if (flags & float_muladd_negate_product) {
3751        pSign ^= 1;
3752    }
3753    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3754    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3755
3756    if (cExp == 0x7ff) {
3757        if (pInf && (pSign ^ cSign)) {
3758            /* addition of opposite-signed infinities => InvalidOperation */
3759            float_raise(float_flag_invalid STATUS_VAR);
3760            return float64_default_nan;
3761        }
3762        /* Otherwise generate an infinity of the same sign */
3763        return packFloat64(cSign ^ signflip, 0x7ff, 0);
3764    }
3765
3766    if (pInf) {
3767        return packFloat64(pSign ^ signflip, 0x7ff, 0);
3768    }
3769
3770    if (pZero) {
3771        if (cExp == 0) {
3772            if (cSig == 0) {
3773                /* Adding two exact zeroes */
3774                if (pSign == cSign) {
3775                    zSign = pSign;
3776                } else if (STATUS(float_rounding_mode) == float_round_down) {
3777                    zSign = 1;
3778                } else {
3779                    zSign = 0;
3780                }
3781                return packFloat64(zSign ^ signflip, 0, 0);
3782            }
3783            /* Exact zero plus a denorm */
3784            if (STATUS(flush_to_zero)) {
3785                float_raise(float_flag_output_denormal STATUS_VAR);
3786                return packFloat64(cSign ^ signflip, 0, 0);
3787            }
3788        }
3789        /* Zero plus something non-zero : just return the something */
3790        return packFloat64(cSign ^ signflip, cExp, cSig);
3791    }
3792
3793    if (aExp == 0) {
3794        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3795    }
3796    if (bExp == 0) {
3797        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3798    }
3799
3800    /* Calculate the actual result a * b + c */
3801
3802    /* Multiply first; this is easy. */
3803    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3804     * because we want the true exponent, not the "one-less-than"
3805     * flavour that roundAndPackFloat64() takes.
3806     */
3807    pExp = aExp + bExp - 0x3fe;
3808    aSig = (aSig | LIT64(0x0010000000000000))<<10;
3809    bSig = (bSig | LIT64(0x0010000000000000))<<11;
3810    mul64To128(aSig, bSig, &pSig0, &pSig1);
3811    if ((int64_t)(pSig0 << 1) >= 0) {
3812        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3813        pExp--;
3814    }
3815
3816    zSign = pSign ^ signflip;
3817
3818    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3819     * bit in position 126.
3820     */
3821    if (cExp == 0) {
3822        if (!cSig) {
3823            /* Throw out the special case of c being an exact zero now */
3824            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3825            return roundAndPackFloat64(zSign, pExp - 1,
3826                                       pSig1 STATUS_VAR);
3827        }
3828        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3829    }
3830
3831    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3832     * significand of the addend, with the explicit bit in position 126.
3833     */
3834    cSig0 = cSig << (126 - 64 - 52);
3835    cSig1 = 0;
3836    cSig0 |= LIT64(0x4000000000000000);
3837    expDiff = pExp - cExp;
3838
3839    if (pSign == cSign) {
3840        /* Addition */
3841        if (expDiff > 0) {
3842            /* scale c to match p */
3843            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3844            zExp = pExp;
3845        } else if (expDiff < 0) {
3846            /* scale p to match c */
3847            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3848            zExp = cExp;
3849        } else {
3850            /* no scaling needed */
3851            zExp = cExp;
3852        }
3853        /* Add significands and make sure explicit bit ends up in posn 126 */
3854        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3855        if ((int64_t)zSig0 < 0) {
3856            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3857        } else {
3858            zExp--;
3859        }
3860        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3861        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3862    } else {
3863        /* Subtraction */
3864        if (expDiff > 0) {
3865            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3866            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3867            zExp = pExp;
3868        } else if (expDiff < 0) {
3869            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3870            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3871            zExp = cExp;
3872            zSign ^= 1;
3873        } else {
3874            zExp = pExp;
3875            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3876                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3877            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3878                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3879                zSign ^= 1;
3880            } else {
3881                /* Exact zero */
3882                zSign = signflip;
3883                if (STATUS(float_rounding_mode) == float_round_down) {
3884                    zSign ^= 1;
3885                }
3886                return packFloat64(zSign, 0, 0);
3887            }
3888        }
3889        --zExp;
3890        /* Do the equivalent of normalizeRoundAndPackFloat64() but
3891         * starting with the significand in a pair of uint64_t.
3892         */
3893        if (zSig0) {
3894            shiftcount = countLeadingZeros64(zSig0) - 1;
3895            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3896            if (zSig1) {
3897                zSig0 |= 1;
3898            }
3899            zExp -= shiftcount;
3900        } else {
3901            shiftcount = countLeadingZeros64(zSig1) - 1;
3902            zSig0 = zSig1 << shiftcount;
3903            zExp -= (shiftcount + 64);
3904        }
3905        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3906    }
3907}
3908
3909/*----------------------------------------------------------------------------
3910| Returns the square root of the double-precision floating-point value `a'.
3911| The operation is performed according to the IEC/IEEE Standard for Binary
3912| Floating-Point Arithmetic.
3913*----------------------------------------------------------------------------*/
3914
3915float64 float64_sqrt( float64 a STATUS_PARAM )
3916{
3917    flag aSign;
3918    int_fast16_t aExp, zExp;
3919    uint64_t aSig, zSig, doubleZSig;
3920    uint64_t rem0, rem1, term0, term1;
3921    a = float64_squash_input_denormal(a STATUS_VAR);
3922
3923    aSig = extractFloat64Frac( a );
3924    aExp = extractFloat64Exp( a );
3925    aSign = extractFloat64Sign( a );
3926    if ( aExp == 0x7FF ) {
3927        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3928        if ( ! aSign ) return a;
3929        float_raise( float_flag_invalid STATUS_VAR);
3930        return float64_default_nan;
3931    }
3932    if ( aSign ) {
3933        if ( ( aExp | aSig ) == 0 ) return a;
3934        float_raise( float_flag_invalid STATUS_VAR);
3935        return float64_default_nan;
3936    }
3937    if ( aExp == 0 ) {
3938        if ( aSig == 0 ) return float64_zero;
3939        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3940    }
3941    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3942    aSig |= LIT64( 0x0010000000000000 );
3943    zSig = estimateSqrt32( aExp, aSig>>21 );
3944    aSig <<= 9 - ( aExp & 1 );
3945    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3946    if ( ( zSig & 0x1FF ) <= 5 ) {
3947        doubleZSig = zSig<<1;
3948        mul64To128( zSig, zSig, &term0, &term1 );
3949        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3950        while ( (int64_t) rem0 < 0 ) {
3951            --zSig;
3952            doubleZSig -= 2;
3953            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3954        }
3955        zSig |= ( ( rem0 | rem1 ) != 0 );
3956    }
3957    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3958
3959}
3960
3961/*----------------------------------------------------------------------------
3962| Returns the binary log of the double-precision floating-point value `a'.
3963| The operation is performed according to the IEC/IEEE Standard for Binary
3964| Floating-Point Arithmetic.
3965*----------------------------------------------------------------------------*/
3966float64 float64_log2( float64 a STATUS_PARAM )
3967{
3968    flag aSign, zSign;
3969    int_fast16_t aExp;
3970    uint64_t aSig, aSig0, aSig1, zSig, i;
3971    a = float64_squash_input_denormal(a STATUS_VAR);
3972
3973    aSig = extractFloat64Frac( a );
3974    aExp = extractFloat64Exp( a );
3975    aSign = extractFloat64Sign( a );
3976
3977    if ( aExp == 0 ) {
3978        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3979        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3980    }
3981    if ( aSign ) {
3982        float_raise( float_flag_invalid STATUS_VAR);
3983        return float64_default_nan;
3984    }
3985    if ( aExp == 0x7FF ) {
3986        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3987        return a;
3988    }
3989
3990    aExp -= 0x3FF;
3991    aSig |= LIT64( 0x0010000000000000 );
3992    zSign = aExp < 0;
3993    zSig = (uint64_t)aExp << 52;
3994    for (i = 1LL << 51; i > 0; i >>= 1) {
3995        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3996        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3997        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3998            aSig >>= 1;
3999            zSig |= i;
4000        }
4001    }
4002
4003    if ( zSign )
4004        zSig = -zSig;
4005    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4006}
4007
4008/*----------------------------------------------------------------------------
4009| Returns 1 if the double-precision floating-point value `a' is equal to the
4010| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4011| if either operand is a NaN.  Otherwise, the comparison is performed
4012| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4013*----------------------------------------------------------------------------*/
4014
4015int float64_eq( float64 a, float64 b STATUS_PARAM )
4016{
4017    uint64_t av, bv;
4018    a = float64_squash_input_denormal(a STATUS_VAR);
4019    b = float64_squash_input_denormal(b STATUS_VAR);
4020
4021    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4022         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4023       ) {
4024        float_raise( float_flag_invalid STATUS_VAR);
4025        return 0;
4026    }
4027    av = float64_val(a);
4028    bv = float64_val(b);
4029    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4030
4031}
4032
4033/*----------------------------------------------------------------------------
4034| Returns 1 if the double-precision floating-point value `a' is less than or
4035| equal to the corresponding value `b', and 0 otherwise.  The invalid
4036| exception is raised if either operand is a NaN.  The comparison is performed
4037| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4038*----------------------------------------------------------------------------*/
4039
4040int float64_le( float64 a, float64 b STATUS_PARAM )
4041{
4042    flag aSign, bSign;
4043    uint64_t av, bv;
4044    a = float64_squash_input_denormal(a STATUS_VAR);
4045    b = float64_squash_input_denormal(b STATUS_VAR);
4046
4047    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4048         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4049       ) {
4050        float_raise( float_flag_invalid STATUS_VAR);
4051        return 0;
4052    }
4053    aSign = extractFloat64Sign( a );
4054    bSign = extractFloat64Sign( b );
4055    av = float64_val(a);
4056    bv = float64_val(b);
4057    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4058    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4059
4060}
4061
4062/*----------------------------------------------------------------------------
4063| Returns 1 if the double-precision floating-point value `a' is less than
4064| the corresponding value `b', and 0 otherwise.  The invalid exception is
4065| raised if either operand is a NaN.  The comparison is performed according
4066| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4067*----------------------------------------------------------------------------*/
4068
4069int float64_lt( float64 a, float64 b STATUS_PARAM )
4070{
4071    flag aSign, bSign;
4072    uint64_t av, bv;
4073
4074    a = float64_squash_input_denormal(a STATUS_VAR);
4075    b = float64_squash_input_denormal(b STATUS_VAR);
4076    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4077         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4078       ) {
4079        float_raise( float_flag_invalid STATUS_VAR);
4080        return 0;
4081    }
4082    aSign = extractFloat64Sign( a );
4083    bSign = extractFloat64Sign( b );
4084    av = float64_val(a);
4085    bv = float64_val(b);
4086    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4087    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4088
4089}
4090
4091/*----------------------------------------------------------------------------
4092| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4093| be compared, and 0 otherwise.  The invalid exception is raised if either
4094| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4095| Standard for Binary Floating-Point Arithmetic.
4096*----------------------------------------------------------------------------*/
4097
4098int float64_unordered( float64 a, float64 b STATUS_PARAM )
4099{
4100    a = float64_squash_input_denormal(a STATUS_VAR);
4101    b = float64_squash_input_denormal(b STATUS_VAR);
4102
4103    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4104         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4105       ) {
4106        float_raise( float_flag_invalid STATUS_VAR);
4107        return 1;
4108    }
4109    return 0;
4110}
4111
4112/*----------------------------------------------------------------------------
4113| Returns 1 if the double-precision floating-point value `a' is equal to the
4114| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4115| exception.The comparison is performed according to the IEC/IEEE Standard
4116| for Binary Floating-Point Arithmetic.
4117*----------------------------------------------------------------------------*/
4118
4119int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4120{
4121    uint64_t av, bv;
4122    a = float64_squash_input_denormal(a STATUS_VAR);
4123    b = float64_squash_input_denormal(b STATUS_VAR);
4124
4125    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4126         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4127       ) {
4128        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4129            float_raise( float_flag_invalid STATUS_VAR);
4130        }
4131        return 0;
4132    }
4133    av = float64_val(a);
4134    bv = float64_val(b);
4135    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4136
4137}
4138
4139/*----------------------------------------------------------------------------
4140| Returns 1 if the double-precision floating-point value `a' is less than or
4141| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4142| cause an exception.  Otherwise, the comparison is performed according to the
4143| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4144*----------------------------------------------------------------------------*/
4145
4146int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4147{
4148    flag aSign, bSign;
4149    uint64_t av, bv;
4150    a = float64_squash_input_denormal(a STATUS_VAR);
4151    b = float64_squash_input_denormal(b STATUS_VAR);
4152
4153    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4154         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4155       ) {
4156        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4157            float_raise( float_flag_invalid STATUS_VAR);
4158        }
4159        return 0;
4160    }
4161    aSign = extractFloat64Sign( a );
4162    bSign = extractFloat64Sign( b );
4163    av = float64_val(a);
4164    bv = float64_val(b);
4165    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4166    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4167
4168}
4169
4170/*----------------------------------------------------------------------------
4171| Returns 1 if the double-precision floating-point value `a' is less than
4172| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4173| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4174| Standard for Binary Floating-Point Arithmetic.
4175*----------------------------------------------------------------------------*/
4176
4177int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4178{
4179    flag aSign, bSign;
4180    uint64_t av, bv;
4181    a = float64_squash_input_denormal(a STATUS_VAR);
4182    b = float64_squash_input_denormal(b STATUS_VAR);
4183
4184    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4185         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4186       ) {
4187        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4188            float_raise( float_flag_invalid STATUS_VAR);
4189        }
4190        return 0;
4191    }
4192    aSign = extractFloat64Sign( a );
4193    bSign = extractFloat64Sign( b );
4194    av = float64_val(a);
4195    bv = float64_val(b);
4196    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4197    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4198
4199}
4200
4201/*----------------------------------------------------------------------------
4202| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4203| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4204| comparison is performed according to the IEC/IEEE Standard for Binary
4205| Floating-Point Arithmetic.
4206*----------------------------------------------------------------------------*/
4207
4208int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4209{
4210    a = float64_squash_input_denormal(a STATUS_VAR);
4211    b = float64_squash_input_denormal(b STATUS_VAR);
4212
4213    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4214         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4215       ) {
4216        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4217            float_raise( float_flag_invalid STATUS_VAR);
4218        }
4219        return 1;
4220    }
4221    return 0;
4222}
4223
4224/*----------------------------------------------------------------------------
4225| Returns the result of converting the extended double-precision floating-
4226| point value `a' to the 32-bit two's complement integer format.  The
4227| conversion is performed according to the IEC/IEEE Standard for Binary
4228| Floating-Point Arithmetic---which means in particular that the conversion
4229| is rounded according to the current rounding mode.  If `a' is a NaN, the
4230| largest positive integer is returned.  Otherwise, if the conversion
4231| overflows, the largest integer with the same sign as `a' is returned.
4232*----------------------------------------------------------------------------*/
4233
4234int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4235{
4236    flag aSign;
4237    int32 aExp, shiftCount;
4238    uint64_t aSig;
4239
4240    aSig = extractFloatx80Frac( a );
4241    aExp = extractFloatx80Exp( a );
4242    aSign = extractFloatx80Sign( a );
4243    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4244    shiftCount = 0x4037 - aExp;
4245    if ( shiftCount <= 0 ) shiftCount = 1;
4246    shift64RightJamming( aSig, shiftCount, &aSig );
4247    return roundAndPackInt32( aSign, aSig STATUS_VAR );
4248
4249}
4250
4251/*----------------------------------------------------------------------------
4252| Returns the result of converting the extended double-precision floating-
4253| point value `a' to the 32-bit two's complement integer format.  The
4254| conversion is performed according to the IEC/IEEE Standard for Binary
4255| Floating-Point Arithmetic, except that the conversion is always rounded
4256| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4257| Otherwise, if the conversion overflows, the largest integer with the same
4258| sign as `a' is returned.
4259*----------------------------------------------------------------------------*/
4260
4261int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4262{
4263    flag aSign;
4264    int32 aExp, shiftCount;
4265    uint64_t aSig, savedASig;
4266    int32_t z;
4267
4268    aSig = extractFloatx80Frac( a );
4269    aExp = extractFloatx80Exp( a );
4270    aSign = extractFloatx80Sign( a );
4271    if ( 0x401E < aExp ) {
4272        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4273        goto invalid;
4274    }
4275    else if ( aExp < 0x3FFF ) {
4276        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4277        return 0;
4278    }
4279    shiftCount = 0x403E - aExp;
4280    savedASig = aSig;
4281    aSig >>= shiftCount;
4282    z = aSig;
4283    if ( aSign ) z = - z;
4284    if ( ( z < 0 ) ^ aSign ) {
4285 invalid:
4286        float_raise( float_flag_invalid STATUS_VAR);
4287        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4288    }
4289    if ( ( aSig<<shiftCount ) != savedASig ) {
4290        STATUS(float_exception_flags) |= float_flag_inexact;
4291    }
4292    return z;
4293
4294}
4295
4296/*----------------------------------------------------------------------------
4297| Returns the result of converting the extended double-precision floating-
4298| point value `a' to the 64-bit two's complement integer format.  The
4299| conversion is performed according to the IEC/IEEE Standard for Binary
4300| Floating-Point Arithmetic---which means in particular that the conversion
4301| is rounded according to the current rounding mode.  If `a' is a NaN,
4302| the largest positive integer is returned.  Otherwise, if the conversion
4303| overflows, the largest integer with the same sign as `a' is returned.
4304*----------------------------------------------------------------------------*/
4305
4306int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4307{
4308    flag aSign;
4309    int32 aExp, shiftCount;
4310    uint64_t aSig, aSigExtra;
4311
4312    aSig = extractFloatx80Frac( a );
4313    aExp = extractFloatx80Exp( a );
4314    aSign = extractFloatx80Sign( a );
4315    shiftCount = 0x403E - aExp;
4316    if ( shiftCount <= 0 ) {
4317        if ( shiftCount ) {
4318            float_raise( float_flag_invalid STATUS_VAR);
4319            if (    ! aSign
4320                 || (    ( aExp == 0x7FFF )
4321                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4322               ) {
4323                return LIT64( 0x7FFFFFFFFFFFFFFF );
4324            }
4325            return (int64_t) LIT64( 0x8000000000000000 );
4326        }
4327        aSigExtra = 0;
4328    }
4329    else {
4330        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4331    }
4332    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4333
4334}
4335
4336/*----------------------------------------------------------------------------
4337| Returns the result of converting the extended double-precision floating-
4338| point value `a' to the 64-bit two's complement integer format.  The
4339| conversion is performed according to the IEC/IEEE Standard for Binary
4340| Floating-Point Arithmetic, except that the conversion is always rounded
4341| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4342| Otherwise, if the conversion overflows, the largest integer with the same
4343| sign as `a' is returned.
4344*----------------------------------------------------------------------------*/
4345
4346int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4347{
4348    flag aSign;
4349    int32 aExp, shiftCount;
4350    uint64_t aSig;
4351    int64 z;
4352
4353    aSig = extractFloatx80Frac( a );
4354    aExp = extractFloatx80Exp( a );
4355    aSign = extractFloatx80Sign( a );
4356    shiftCount = aExp - 0x403E;
4357    if ( 0 <= shiftCount ) {
4358        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4359        if ( ( a.high != 0xC03E ) || aSig ) {
4360            float_raise( float_flag_invalid STATUS_VAR);
4361            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4362                return LIT64( 0x7FFFFFFFFFFFFFFF );
4363            }
4364        }
4365        return (int64_t) LIT64( 0x8000000000000000 );
4366    }
4367    else if ( aExp < 0x3FFF ) {
4368        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4369        return 0;
4370    }
4371    z = aSig>>( - shiftCount );
4372    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4373        STATUS(float_exception_flags) |= float_flag_inexact;
4374    }
4375    if ( aSign ) z = - z;
4376    return z;
4377
4378}
4379
4380/*----------------------------------------------------------------------------
4381| Returns the result of converting the extended double-precision floating-
4382| point value `a' to the single-precision floating-point format.  The
4383| conversion is performed according to the IEC/IEEE Standard for Binary
4384| Floating-Point Arithmetic.
4385*----------------------------------------------------------------------------*/
4386
4387float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4388{
4389    flag aSign;
4390    int32 aExp;
4391    uint64_t aSig;
4392
4393    aSig = extractFloatx80Frac( a );
4394    aExp = extractFloatx80Exp( a );
4395    aSign = extractFloatx80Sign( a );
4396    if ( aExp == 0x7FFF ) {
4397        if ( (uint64_t) ( aSig<<1 ) ) {
4398            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4399        }
4400        return packFloat32( aSign, 0xFF, 0 );
4401    }
4402    shift64RightJamming( aSig, 33, &aSig );
4403    if ( aExp || aSig ) aExp -= 0x3F81;
4404    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4405
4406}
4407
4408/*----------------------------------------------------------------------------
4409| Returns the result of converting the extended double-precision floating-
4410| point value `a' to the double-precision floating-point format.  The
4411| conversion is performed according to the IEC/IEEE Standard for Binary
4412| Floating-Point Arithmetic.
4413*----------------------------------------------------------------------------*/
4414
4415float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4416{
4417    flag aSign;
4418    int32 aExp;
4419    uint64_t aSig, zSig;
4420
4421    aSig = extractFloatx80Frac( a );
4422    aExp = extractFloatx80Exp( a );
4423    aSign = extractFloatx80Sign( a );
4424    if ( aExp == 0x7FFF ) {
4425        if ( (uint64_t) ( aSig<<1 ) ) {
4426            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4427        }
4428        return packFloat64( aSign, 0x7FF, 0 );
4429    }
4430    shift64RightJamming( aSig, 1, &zSig );
4431    if ( aExp || aSig ) aExp -= 0x3C01;
4432    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4433
4434}
4435
4436/*----------------------------------------------------------------------------
4437| Returns the result of converting the extended double-precision floating-
4438| point value `a' to the quadruple-precision floating-point format.  The
4439| conversion is performed according to the IEC/IEEE Standard for Binary
4440| Floating-Point Arithmetic.
4441*----------------------------------------------------------------------------*/
4442
4443float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4444{
4445    flag aSign;
4446    int_fast16_t aExp;
4447    uint64_t aSig, zSig0, zSig1;
4448
4449    aSig = extractFloatx80Frac( a );
4450    aExp = extractFloatx80Exp( a );
4451    aSign = extractFloatx80Sign( a );
4452    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4453        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4454    }
4455    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4456    return packFloat128( aSign, aExp, zSig0, zSig1 );
4457
4458}
4459
4460/*----------------------------------------------------------------------------
4461| Rounds the extended double-precision floating-point value `a' to an integer,
4462| and returns the result as an extended quadruple-precision floating-point
4463| value.  The operation is performed according to the IEC/IEEE Standard for
4464| Binary Floating-Point Arithmetic.
4465*----------------------------------------------------------------------------*/
4466
4467floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4468{
4469    flag aSign;
4470    int32 aExp;
4471    uint64_t lastBitMask, roundBitsMask;
4472    int8 roundingMode;
4473    floatx80 z;
4474
4475    aExp = extractFloatx80Exp( a );
4476    if ( 0x403E <= aExp ) {
4477        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4478            return propagateFloatx80NaN( a, a STATUS_VAR );
4479        }
4480        return a;
4481    }
4482    if ( aExp < 0x3FFF ) {
4483        if (    ( aExp == 0 )
4484             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4485            return a;
4486        }
4487        STATUS(float_exception_flags) |= float_flag_inexact;
4488        aSign = extractFloatx80Sign( a );
4489        switch ( STATUS(float_rounding_mode) ) {
4490         case float_round_nearest_even:
4491            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4492               ) {
4493                return
4494                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4495            }
4496            break;
4497         case float_round_down:
4498            return
4499                  aSign ?
4500                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4501                : packFloatx80( 0, 0, 0 );
4502         case float_round_up:
4503            return
4504                  aSign ? packFloatx80( 1, 0, 0 )
4505                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4506        }
4507        return packFloatx80( aSign, 0, 0 );
4508    }
4509    lastBitMask = 1;
4510    lastBitMask <<= 0x403E - aExp;
4511    roundBitsMask = lastBitMask - 1;
4512    z = a;
4513    roundingMode = STATUS(float_rounding_mode);
4514    if ( roundingMode == float_round_nearest_even ) {
4515        z.low += lastBitMask>>1;
4516        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4517    }
4518    else if ( roundingMode != float_round_to_zero ) {
4519        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4520            z.low += roundBitsMask;
4521        }
4522    }
4523    z.low &= ~ roundBitsMask;
4524    if ( z.low == 0 ) {
4525        ++z.high;
4526        z.low = LIT64( 0x8000000000000000 );
4527    }
4528    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4529    return z;
4530
4531}
4532
4533/*----------------------------------------------------------------------------
4534| Returns the result of adding the absolute values of the extended double-
4535| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4536| negated before being returned.  `zSign' is ignored if the result is a NaN.
4537| The addition is performed according to the IEC/IEEE Standard for Binary
4538| Floating-Point Arithmetic.
4539*----------------------------------------------------------------------------*/
4540
4541static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4542{
4543    int32 aExp, bExp, zExp;
4544    uint64_t aSig, bSig, zSig0, zSig1;
4545    int32 expDiff;
4546
4547    aSig = extractFloatx80Frac( a );
4548    aExp = extractFloatx80Exp( a );
4549    bSig = extractFloatx80Frac( b );
4550    bExp = extractFloatx80Exp( b );
4551    expDiff = aExp - bExp;
4552    if ( 0 < expDiff ) {
4553        if ( aExp == 0x7FFF ) {
4554            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4555            return a;
4556        }
4557        if ( bExp == 0 ) --expDiff;
4558        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4559        zExp = aExp;
4560    }
4561    else if ( expDiff < 0 ) {
4562        if ( bExp == 0x7FFF ) {
4563            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4564            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4565        }
4566        if ( aExp == 0 ) ++expDiff;
4567        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4568        zExp = bExp;
4569    }
4570    else {
4571        if ( aExp == 0x7FFF ) {
4572            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4573                return propagateFloatx80NaN( a, b STATUS_VAR );
4574            }
4575            return a;
4576        }
4577        zSig1 = 0;
4578        zSig0 = aSig + bSig;
4579        if ( aExp == 0 ) {
4580            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4581            goto roundAndPack;
4582        }
4583        zExp = aExp;
4584        goto shiftRight1;
4585    }
4586    zSig0 = aSig + bSig;
4587    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4588 shiftRight1:
4589    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4590    zSig0 |= LIT64( 0x8000000000000000 );
4591    ++zExp;
4592 roundAndPack:
4593    return
4594        roundAndPackFloatx80(
4595            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4596
4597}
4598
4599/*----------------------------------------------------------------------------
4600| Returns the result of subtracting the absolute values of the extended
4601| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4602| difference is negated before being returned.  `zSign' is ignored if the
4603| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4604| Standard for Binary Floating-Point Arithmetic.
4605*----------------------------------------------------------------------------*/
4606
4607static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4608{
4609    int32 aExp, bExp, zExp;
4610    uint64_t aSig, bSig, zSig0, zSig1;
4611    int32 expDiff;
4612    floatx80 z;
4613
4614    aSig = extractFloatx80Frac( a );
4615    aExp = extractFloatx80Exp( a );
4616    bSig = extractFloatx80Frac( b );
4617    bExp = extractFloatx80Exp( b );
4618    expDiff = aExp - bExp;
4619    if ( 0 < expDiff ) goto aExpBigger;
4620    if ( expDiff < 0 ) goto bExpBigger;
4621    if ( aExp == 0x7FFF ) {
4622        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4623            return propagateFloatx80NaN( a, b STATUS_VAR );
4624        }
4625        float_raise( float_flag_invalid STATUS_VAR);
4626        z.low = floatx80_default_nan_low;
4627        z.high = floatx80_default_nan_high;
4628        return z;
4629    }
4630    if ( aExp == 0 ) {
4631        aExp = 1;
4632        bExp = 1;
4633    }
4634    zSig1 = 0;
4635    if ( bSig < aSig ) goto aBigger;
4636    if ( aSig < bSig ) goto bBigger;
4637    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4638 bExpBigger:
4639    if ( bExp == 0x7FFF ) {
4640        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4641        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4642    }
4643    if ( aExp == 0 ) ++expDiff;
4644    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4645 bBigger:
4646    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4647    zExp = bExp;
4648    zSign ^= 1;
4649    goto normalizeRoundAndPack;
4650 aExpBigger:
4651    if ( aExp == 0x7FFF ) {
4652        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4653        return a;
4654    }
4655    if ( bExp == 0 ) --expDiff;
4656    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4657 aBigger:
4658    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4659    zExp = aExp;
4660 normalizeRoundAndPack:
4661    return
4662        normalizeRoundAndPackFloatx80(
4663            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4664
4665}
4666
4667/*----------------------------------------------------------------------------
4668| Returns the result of adding the extended double-precision floating-point
4669| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4670| Standard for Binary Floating-Point Arithmetic.
4671*----------------------------------------------------------------------------*/
4672
4673floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4674{
4675    flag aSign, bSign;
4676
4677    aSign = extractFloatx80Sign( a );
4678    bSign = extractFloatx80Sign( b );
4679    if ( aSign == bSign ) {
4680        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4681    }
4682    else {
4683        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4684    }
4685
4686}
4687
4688/*----------------------------------------------------------------------------
4689| Returns the result of subtracting the extended double-precision floating-
4690| point values `a' and `b'.  The operation is performed according to the
4691| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4692*----------------------------------------------------------------------------*/
4693
4694floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4695{
4696    flag aSign, bSign;
4697
4698    aSign = extractFloatx80Sign( a );
4699    bSign = extractFloatx80Sign( b );
4700    if ( aSign == bSign ) {
4701        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4702    }
4703    else {
4704        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4705    }
4706
4707}
4708
4709/*----------------------------------------------------------------------------
4710| Returns the result of multiplying the extended double-precision floating-
4711| point values `a' and `b'.  The operation is performed according to the
4712| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4713*----------------------------------------------------------------------------*/
4714
4715floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4716{
4717    flag aSign, bSign, zSign;
4718    int32 aExp, bExp, zExp;
4719    uint64_t aSig, bSig, zSig0, zSig1;
4720    floatx80 z;
4721
4722    aSig = extractFloatx80Frac( a );
4723    aExp = extractFloatx80Exp( a );
4724    aSign = extractFloatx80Sign( a );
4725    bSig = extractFloatx80Frac( b );
4726    bExp = extractFloatx80Exp( b );
4727    bSign = extractFloatx80Sign( b );
4728    zSign = aSign ^ bSign;
4729    if ( aExp == 0x7FFF ) {
4730        if (    (uint64_t) ( aSig<<1 )
4731             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4732            return propagateFloatx80NaN( a, b STATUS_VAR );
4733        }
4734        if ( ( bExp | bSig ) == 0 ) goto invalid;
4735        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4736    }
4737    if ( bExp == 0x7FFF ) {
4738        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4739        if ( ( aExp | aSig ) == 0 ) {
4740 invalid:
4741            float_raise( float_flag_invalid STATUS_VAR);
4742            z.low = floatx80_default_nan_low;
4743            z.high = floatx80_default_nan_high;
4744            return z;
4745        }
4746        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4747    }
4748    if ( aExp == 0 ) {
4749        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4750        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4751    }
4752    if ( bExp == 0 ) {
4753        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4754        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4755    }
4756    zExp = aExp + bExp - 0x3FFE;
4757    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4758    if ( 0 < (int64_t) zSig0 ) {
4759        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4760        --zExp;
4761    }
4762    return
4763        roundAndPackFloatx80(
4764            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4765
4766}
4767
4768/*----------------------------------------------------------------------------
4769| Returns the result of dividing the extended double-precision floating-point
4770| value `a' by the corresponding value `b'.  The operation is performed
4771| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4772*----------------------------------------------------------------------------*/
4773
4774floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4775{
4776    flag aSign, bSign, zSign;
4777    int32 aExp, bExp, zExp;
4778    uint64_t aSig, bSig, zSig0, zSig1;
4779    uint64_t rem0, rem1, rem2, term0, term1, term2;
4780    floatx80 z;
4781
4782    aSig = extractFloatx80Frac( a );
4783    aExp = extractFloatx80Exp( a );
4784    aSign = extractFloatx80Sign( a );
4785    bSig = extractFloatx80Frac( b );
4786    bExp = extractFloatx80Exp( b );
4787    bSign = extractFloatx80Sign( b );
4788    zSign = aSign ^ bSign;
4789    if ( aExp == 0x7FFF ) {
4790        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4791        if ( bExp == 0x7FFF ) {
4792            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4793            goto invalid;
4794        }
4795        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4796    }
4797    if ( bExp == 0x7FFF ) {
4798        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4799        return packFloatx80( zSign, 0, 0 );
4800    }
4801    if ( bExp == 0 ) {
4802        if ( bSig == 0 ) {
4803            if ( ( aExp | aSig ) == 0 ) {
4804 invalid:
4805                float_raise( float_flag_invalid STATUS_VAR);
4806                z.low = floatx80_default_nan_low;
4807                z.high = floatx80_default_nan_high;
4808                return z;
4809            }
4810            float_raise( float_flag_divbyzero STATUS_VAR);
4811            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4812        }
4813        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4814    }
4815    if ( aExp == 0 ) {
4816        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4817        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4818    }
4819    zExp = aExp - bExp + 0x3FFE;
4820    rem1 = 0;
4821    if ( bSig <= aSig ) {
4822        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4823        ++zExp;
4824    }
4825    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4826    mul64To128( bSig, zSig0, &term0, &term1 );
4827    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4828    while ( (int64_t) rem0 < 0 ) {
4829        --zSig0;
4830        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4831    }
4832    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4833    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4834        mul64To128( bSig, zSig1, &term1, &term2 );
4835        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4836        while ( (int64_t) rem1 < 0 ) {
4837            --zSig1;
4838            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4839        }
4840        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4841    }
4842    return
4843        roundAndPackFloatx80(
4844            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4845
4846}
4847
4848/*----------------------------------------------------------------------------
4849| Returns the remainder of the extended double-precision floating-point value
4850| `a' with respect to the corresponding value `b'.  The operation is performed
4851| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4852*----------------------------------------------------------------------------*/
4853
4854floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4855{
4856    flag aSign, zSign;
4857    int32 aExp, bExp, expDiff;
4858    uint64_t aSig0, aSig1, bSig;
4859    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4860    floatx80 z;
4861
4862    aSig0 = extractFloatx80Frac( a );
4863    aExp = extractFloatx80Exp( a );
4864    aSign = extractFloatx80Sign( a );
4865    bSig = extractFloatx80Frac( b );
4866    bExp = extractFloatx80Exp( b );
4867    if ( aExp == 0x7FFF ) {
4868        if (    (uint64_t) ( aSig0<<1 )
4869             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4870            return propagateFloatx80NaN( a, b STATUS_VAR );
4871        }
4872        goto invalid;
4873    }
4874    if ( bExp == 0x7FFF ) {
4875        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4876        return a;
4877    }
4878    if ( bExp == 0 ) {
4879        if ( bSig == 0 ) {
4880 invalid:
4881            float_raise( float_flag_invalid STATUS_VAR);
4882            z.low = floatx80_default_nan_low;
4883            z.high = floatx80_default_nan_high;
4884            return z;
4885        }
4886        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4887    }
4888    if ( aExp == 0 ) {
4889        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4890        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4891    }
4892    bSig |= LIT64( 0x8000000000000000 );
4893    zSign = aSign;
4894    expDiff = aExp - bExp;
4895    aSig1 = 0;
4896    if ( expDiff < 0 ) {
4897        if ( expDiff < -1 ) return a;
4898        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4899        expDiff = 0;
4900    }
4901    q = ( bSig <= aSig0 );
4902    if ( q ) aSig0 -= bSig;
4903    expDiff -= 64;
4904    while ( 0 < expDiff ) {
4905        q = estimateDiv128To64( aSig0, aSig1, bSig );
4906        q = ( 2 < q ) ? q - 2 : 0;
4907        mul64To128( bSig, q, &term0, &term1 );
4908        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4909        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4910        expDiff -= 62;
4911    }
4912    expDiff += 64;
4913    if ( 0 < expDiff ) {
4914        q = estimateDiv128To64( aSig0, aSig1, bSig );
4915        q = ( 2 < q ) ? q - 2 : 0;
4916        q >>= 64 - expDiff;
4917        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4918        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4919        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4920        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4921            ++q;
4922            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4923        }
4924    }
4925    else {
4926        term1 = 0;
4927        term0 = bSig;
4928    }
4929    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4930    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4931         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4932              && ( q & 1 ) )
4933       ) {
4934        aSig0 = alternateASig0;
4935        aSig1 = alternateASig1;
4936        zSign = ! zSign;
4937    }
4938    return
4939        normalizeRoundAndPackFloatx80(
4940            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4941
4942}
4943
4944/*----------------------------------------------------------------------------
4945| Returns the square root of the extended double-precision floating-point
4946| value `a'.  The operation is performed according to the IEC/IEEE Standard
4947| for Binary Floating-Point Arithmetic.
4948*----------------------------------------------------------------------------*/
4949
4950floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4951{
4952    flag aSign;
4953    int32 aExp, zExp;
4954    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4955    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4956    floatx80 z;
4957
4958    aSig0 = extractFloatx80Frac( a );
4959    aExp = extractFloatx80Exp( a );
4960    aSign = extractFloatx80Sign( a );
4961    if ( aExp == 0x7FFF ) {
4962        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4963        if ( ! aSign ) return a;
4964        goto invalid;
4965    }
4966    if ( aSign ) {
4967        if ( ( aExp | aSig0 ) == 0 ) return a;
4968 invalid:
4969        float_raise( float_flag_invalid STATUS_VAR);
4970        z.low = floatx80_default_nan_low;
4971        z.high = floatx80_default_nan_high;
4972        return z;
4973    }
4974    if ( aExp == 0 ) {
4975        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4976        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4977    }
4978    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4979    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4980    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4981    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4982    doubleZSig0 = zSig0<<1;
4983    mul64To128( zSig0, zSig0, &term0, &term1 );
4984    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4985    while ( (int64_t) rem0 < 0 ) {
4986        --zSig0;
4987        doubleZSig0 -= 2;
4988        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4989    }
4990    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4991    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4992        if ( zSig1 == 0 ) zSig1 = 1;
4993        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4994        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4995        mul64To128( zSig1, zSig1, &term2, &term3 );
4996        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4997        while ( (int64_t) rem1 < 0 ) {
4998            --zSig1;
4999            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5000            term3 |= 1;
5001            term2 |= doubleZSig0;
5002            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5003        }
5004        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5005    }
5006    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5007    zSig0 |= doubleZSig0;
5008    return
5009        roundAndPackFloatx80(
5010            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5011
5012}
5013
5014/*----------------------------------------------------------------------------
5015| Returns 1 if the extended double-precision floating-point value `a' is equal
5016| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5017| raised if either operand is a NaN.  Otherwise, the comparison is performed
5018| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019*----------------------------------------------------------------------------*/
5020
5021int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5022{
5023
5024    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5025              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5026         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5027              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5028       ) {
5029        float_raise( float_flag_invalid STATUS_VAR);
5030        return 0;
5031    }
5032    return
5033           ( a.low == b.low )
5034        && (    ( a.high == b.high )
5035             || (    ( a.low == 0 )
5036                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5037           );
5038
5039}
5040
5041/*----------------------------------------------------------------------------
5042| Returns 1 if the extended double-precision floating-point value `a' is
5043| less than or equal to the corresponding value `b', and 0 otherwise.  The
5044| invalid exception is raised if either operand is a NaN.  The comparison is
5045| performed according to the IEC/IEEE Standard for Binary Floating-Point
5046| Arithmetic.
5047*----------------------------------------------------------------------------*/
5048
5049int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5050{
5051    flag aSign, bSign;
5052
5053    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5054              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5055         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5056              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5057       ) {
5058        float_raise( float_flag_invalid STATUS_VAR);
5059        return 0;
5060    }
5061    aSign = extractFloatx80Sign( a );
5062    bSign = extractFloatx80Sign( b );
5063    if ( aSign != bSign ) {
5064        return
5065               aSign
5066            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5067                 == 0 );
5068    }
5069    return
5070          aSign ? le128( b.high, b.low, a.high, a.low )
5071        : le128( a.high, a.low, b.high, b.low );
5072
5073}
5074
5075/*----------------------------------------------------------------------------
5076| Returns 1 if the extended double-precision floating-point value `a' is
5077| less than the corresponding value `b', and 0 otherwise.  The invalid
5078| exception is raised if either operand is a NaN.  The comparison is performed
5079| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5080*----------------------------------------------------------------------------*/
5081
5082int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5083{
5084    flag aSign, bSign;
5085
5086    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5087              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5088         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5089              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5090       ) {
5091        float_raise( float_flag_invalid STATUS_VAR);
5092        return 0;
5093    }
5094    aSign = extractFloatx80Sign( a );
5095    bSign = extractFloatx80Sign( b );
5096    if ( aSign != bSign ) {
5097        return
5098               aSign
5099            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5100                 != 0 );
5101    }
5102    return
5103          aSign ? lt128( b.high, b.low, a.high, a.low )
5104        : lt128( a.high, a.low, b.high, b.low );
5105
5106}
5107
5108/*----------------------------------------------------------------------------
5109| Returns 1 if the extended double-precision floating-point values `a' and `b'
5110| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5111| either operand is a NaN.   The comparison is performed according to the
5112| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5113*----------------------------------------------------------------------------*/
5114int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5115{
5116    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5117              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5118         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5119              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5120       ) {
5121        float_raise( float_flag_invalid STATUS_VAR);
5122        return 1;
5123    }
5124    return 0;
5125}
5126
5127/*----------------------------------------------------------------------------
5128| Returns 1 if the extended double-precision floating-point value `a' is
5129| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5130| cause an exception.  The comparison is performed according to the IEC/IEEE
5131| Standard for Binary Floating-Point Arithmetic.
5132*----------------------------------------------------------------------------*/
5133
5134int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5135{
5136
5137    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5138              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5139         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5140              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5141       ) {
5142        if (    floatx80_is_signaling_nan( a )
5143             || floatx80_is_signaling_nan( b ) ) {
5144            float_raise( float_flag_invalid STATUS_VAR);
5145        }
5146        return 0;
5147    }
5148    return
5149           ( a.low == b.low )
5150        && (    ( a.high == b.high )
5151             || (    ( a.low == 0 )
5152                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5153           );
5154
5155}
5156
5157/*----------------------------------------------------------------------------
5158| Returns 1 if the extended double-precision floating-point value `a' is less
5159| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5160| do not cause an exception.  Otherwise, the comparison is performed according
5161| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5162*----------------------------------------------------------------------------*/
5163
5164int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5165{
5166    flag aSign, bSign;
5167
5168    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5169              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5170         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5171              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5172       ) {
5173        if (    floatx80_is_signaling_nan( a )
5174             || floatx80_is_signaling_nan( b ) ) {
5175            float_raise( float_flag_invalid STATUS_VAR);
5176        }
5177        return 0;
5178    }
5179    aSign = extractFloatx80Sign( a );
5180    bSign = extractFloatx80Sign( b );
5181    if ( aSign != bSign ) {
5182        return
5183               aSign
5184            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5185                 == 0 );
5186    }
5187    return
5188          aSign ? le128( b.high, b.low, a.high, a.low )
5189        : le128( a.high, a.low, b.high, b.low );
5190
5191}
5192
5193/*----------------------------------------------------------------------------
5194| Returns 1 if the extended double-precision floating-point value `a' is less
5195| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5196| an exception.  Otherwise, the comparison is performed according to the
5197| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198*----------------------------------------------------------------------------*/
5199
5200int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5201{
5202    flag aSign, bSign;
5203
5204    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5205              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5206         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5207              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5208       ) {
5209        if (    floatx80_is_signaling_nan( a )
5210             || floatx80_is_signaling_nan( b ) ) {
5211            float_raise( float_flag_invalid STATUS_VAR);
5212        }
5213        return 0;
5214    }
5215    aSign = extractFloatx80Sign( a );
5216    bSign = extractFloatx80Sign( b );
5217    if ( aSign != bSign ) {
5218        return
5219               aSign
5220            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5221                 != 0 );
5222    }
5223    return
5224          aSign ? lt128( b.high, b.low, a.high, a.low )
5225        : lt128( a.high, a.low, b.high, b.low );
5226
5227}
5228
5229/*----------------------------------------------------------------------------
5230| Returns 1 if the extended double-precision floating-point values `a' and `b'
5231| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5232| The comparison is performed according to the IEC/IEEE Standard for Binary
5233| Floating-Point Arithmetic.
5234*----------------------------------------------------------------------------*/
5235int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5236{
5237    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5238              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5239         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5240              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5241       ) {
5242        if (    floatx80_is_signaling_nan( a )
5243             || floatx80_is_signaling_nan( b ) ) {
5244            float_raise( float_flag_invalid STATUS_VAR);
5245        }
5246        return 1;
5247    }
5248    return 0;
5249}
5250
5251/*----------------------------------------------------------------------------
5252| Returns the result of converting the quadruple-precision floating-point
5253| value `a' to the 32-bit two's complement integer format.  The conversion
5254| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5255| Arithmetic---which means in particular that the conversion is rounded
5256| according to the current rounding mode.  If `a' is a NaN, the largest
5257| positive integer is returned.  Otherwise, if the conversion overflows, the
5258| largest integer with the same sign as `a' is returned.
5259*----------------------------------------------------------------------------*/
5260
5261int32 float128_to_int32( float128 a STATUS_PARAM )
5262{
5263    flag aSign;
5264    int32 aExp, shiftCount;
5265    uint64_t aSig0, aSig1;
5266
5267    aSig1 = extractFloat128Frac1( a );
5268    aSig0 = extractFloat128Frac0( a );
5269    aExp = extractFloat128Exp( a );
5270    aSign = extractFloat128Sign( a );
5271    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5272    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5273    aSig0 |= ( aSig1 != 0 );
5274    shiftCount = 0x4028 - aExp;
5275    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5276    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5277
5278}
5279
5280/*----------------------------------------------------------------------------
5281| Returns the result of converting the quadruple-precision floating-point
5282| value `a' to the 32-bit two's complement integer format.  The conversion
5283| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5284| Arithmetic, except that the conversion is always rounded toward zero.  If
5285| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5286| conversion overflows, the largest integer with the same sign as `a' is
5287| returned.
5288*----------------------------------------------------------------------------*/
5289
5290int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5291{
5292    flag aSign;
5293    int32 aExp, shiftCount;
5294    uint64_t aSig0, aSig1, savedASig;
5295    int32_t z;
5296
5297    aSig1 = extractFloat128Frac1( a );
5298    aSig0 = extractFloat128Frac0( a );
5299    aExp = extractFloat128Exp( a );
5300    aSign = extractFloat128Sign( a );
5301    aSig0 |= ( aSig1 != 0 );
5302    if ( 0x401E < aExp ) {
5303        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5304        goto invalid;
5305    }
5306    else if ( aExp < 0x3FFF ) {
5307        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5308        return 0;
5309    }
5310    aSig0 |= LIT64( 0x0001000000000000 );
5311    shiftCount = 0x402F - aExp;
5312    savedASig = aSig0;
5313    aSig0 >>= shiftCount;
5314    z = aSig0;
5315    if ( aSign ) z = - z;
5316    if ( ( z < 0 ) ^ aSign ) {
5317 invalid:
5318        float_raise( float_flag_invalid STATUS_VAR);
5319        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5320    }
5321    if ( ( aSig0<<shiftCount ) != savedASig ) {
5322        STATUS(float_exception_flags) |= float_flag_inexact;
5323    }
5324    return z;
5325
5326}
5327
5328/*----------------------------------------------------------------------------
5329| Returns the result of converting the quadruple-precision floating-point
5330| value `a' to the 64-bit two's complement integer format.  The conversion
5331| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5332| Arithmetic---which means in particular that the conversion is rounded
5333| according to the current rounding mode.  If `a' is a NaN, the largest
5334| positive integer is returned.  Otherwise, if the conversion overflows, the
5335| largest integer with the same sign as `a' is returned.
5336*----------------------------------------------------------------------------*/
5337
5338int64 float128_to_int64( float128 a STATUS_PARAM )
5339{
5340    flag aSign;
5341    int32 aExp, shiftCount;
5342    uint64_t aSig0, aSig1;
5343
5344    aSig1 = extractFloat128Frac1( a );
5345    aSig0 = extractFloat128Frac0( a );
5346    aExp = extractFloat128Exp( a );
5347    aSign = extractFloat128Sign( a );
5348    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5349    shiftCount = 0x402F - aExp;
5350    if ( shiftCount <= 0 ) {
5351        if ( 0x403E < aExp ) {
5352            float_raise( float_flag_invalid STATUS_VAR);
5353            if (    ! aSign
5354                 || (    ( aExp == 0x7FFF )
5355                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5356                    )
5357               ) {
5358                return LIT64( 0x7FFFFFFFFFFFFFFF );
5359            }
5360            return (int64_t) LIT64( 0x8000000000000000 );
5361        }
5362        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5363    }
5364    else {
5365        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5366    }
5367    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5368
5369}
5370
5371/*----------------------------------------------------------------------------
5372| Returns the result of converting the quadruple-precision floating-point
5373| value `a' to the 64-bit two's complement integer format.  The conversion
5374| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5375| Arithmetic, except that the conversion is always rounded toward zero.
5376| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5377| the conversion overflows, the largest integer with the same sign as `a' is
5378| returned.
5379*----------------------------------------------------------------------------*/
5380
5381int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5382{
5383    flag aSign;
5384    int32 aExp, shiftCount;
5385    uint64_t aSig0, aSig1;
5386    int64 z;
5387
5388    aSig1 = extractFloat128Frac1( a );
5389    aSig0 = extractFloat128Frac0( a );
5390    aExp = extractFloat128Exp( a );
5391    aSign = extractFloat128Sign( a );
5392    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5393    shiftCount = aExp - 0x402F;
5394    if ( 0 < shiftCount ) {
5395        if ( 0x403E <= aExp ) {
5396            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5397            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5398                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5399                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5400            }
5401            else {
5402                float_raise( float_flag_invalid STATUS_VAR);
5403                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5404                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5405                }
5406            }
5407            return (int64_t) LIT64( 0x8000000000000000 );
5408        }
5409        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5410        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5411            STATUS(float_exception_flags) |= float_flag_inexact;
5412        }
5413    }
5414    else {
5415        if ( aExp < 0x3FFF ) {
5416            if ( aExp | aSig0 | aSig1 ) {
5417                STATUS(float_exception_flags) |= float_flag_inexact;
5418            }
5419            return 0;
5420        }
5421        z = aSig0>>( - shiftCount );
5422        if (    aSig1
5423             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5424            STATUS(float_exception_flags) |= float_flag_inexact;
5425        }
5426    }
5427    if ( aSign ) z = - z;
5428    return z;
5429
5430}
5431
5432/*----------------------------------------------------------------------------
5433| Returns the result of converting the quadruple-precision floating-point
5434| value `a' to the single-precision floating-point format.  The conversion
5435| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5436| Arithmetic.
5437*----------------------------------------------------------------------------*/
5438
5439float32 float128_to_float32( float128 a STATUS_PARAM )
5440{
5441    flag aSign;
5442    int32 aExp;
5443    uint64_t aSig0, aSig1;
5444    uint32_t zSig;
5445
5446    aSig1 = extractFloat128Frac1( a );
5447    aSig0 = extractFloat128Frac0( a );
5448    aExp = extractFloat128Exp( a );
5449    aSign = extractFloat128Sign( a );
5450    if ( aExp == 0x7FFF ) {
5451        if ( aSig0 | aSig1 ) {
5452            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5453        }
5454        return packFloat32( aSign, 0xFF, 0 );
5455    }
5456    aSig0 |= ( aSig1 != 0 );
5457    shift64RightJamming( aSig0, 18, &aSig0 );
5458    zSig = aSig0;
5459    if ( aExp || zSig ) {
5460        zSig |= 0x40000000;
5461        aExp -= 0x3F81;
5462    }
5463    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5464
5465}
5466
5467/*----------------------------------------------------------------------------
5468| Returns the result of converting the quadruple-precision floating-point
5469| value `a' to the double-precision floating-point format.  The conversion
5470| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5471| Arithmetic.
5472*----------------------------------------------------------------------------*/
5473
5474float64 float128_to_float64( float128 a STATUS_PARAM )
5475{
5476    flag aSign;
5477    int32 aExp;
5478    uint64_t aSig0, aSig1;
5479
5480    aSig1 = extractFloat128Frac1( a );
5481    aSig0 = extractFloat128Frac0( a );
5482    aExp = extractFloat128Exp( a );
5483    aSign = extractFloat128Sign( a );
5484    if ( aExp == 0x7FFF ) {
5485        if ( aSig0 | aSig1 ) {
5486            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5487        }
5488        return packFloat64( aSign, 0x7FF, 0 );
5489    }
5490    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5491    aSig0 |= ( aSig1 != 0 );
5492    if ( aExp || aSig0 ) {
5493        aSig0 |= LIT64( 0x4000000000000000 );
5494        aExp -= 0x3C01;
5495    }
5496    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5497
5498}
5499
5500/*----------------------------------------------------------------------------
5501| Returns the result of converting the quadruple-precision floating-point
5502| value `a' to the extended double-precision floating-point format.  The
5503| conversion is performed according to the IEC/IEEE Standard for Binary
5504| Floating-Point Arithmetic.
5505*----------------------------------------------------------------------------*/
5506
5507floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5508{
5509    flag aSign;
5510    int32 aExp;
5511    uint64_t aSig0, aSig1;
5512
5513    aSig1 = extractFloat128Frac1( a );
5514    aSig0 = extractFloat128Frac0( a );
5515    aExp = extractFloat128Exp( a );
5516    aSign = extractFloat128Sign( a );
5517    if ( aExp == 0x7FFF ) {
5518        if ( aSig0 | aSig1 ) {
5519            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5520        }
5521        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5522    }
5523    if ( aExp == 0 ) {
5524        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5525        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5526    }
5527    else {
5528        aSig0 |= LIT64( 0x0001000000000000 );
5529    }
5530    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5531    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5532
5533}
5534
5535/*----------------------------------------------------------------------------
5536| Rounds the quadruple-precision floating-point value `a' to an integer, and
5537| returns the result as a quadruple-precision floating-point value.  The
5538| operation is performed according to the IEC/IEEE Standard for Binary
5539| Floating-Point Arithmetic.
5540*----------------------------------------------------------------------------*/
5541
5542float128 float128_round_to_int( float128 a STATUS_PARAM )
5543{
5544    flag aSign;
5545    int32 aExp;
5546    uint64_t lastBitMask, roundBitsMask;
5547    int8 roundingMode;
5548    float128 z;
5549
5550    aExp = extractFloat128Exp( a );
5551    if ( 0x402F <= aExp ) {
5552        if ( 0x406F <= aExp ) {
5553            if (    ( aExp == 0x7FFF )
5554                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5555               ) {
5556                return propagateFloat128NaN( a, a STATUS_VAR );
5557            }
5558            return a;
5559        }
5560        lastBitMask = 1;
5561        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5562        roundBitsMask = lastBitMask - 1;
5563        z = a;
5564        roundingMode = STATUS(float_rounding_mode);
5565        if ( roundingMode == float_round_nearest_even ) {
5566            if ( lastBitMask ) {
5567                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5568                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5569            }
5570            else {
5571                if ( (int64_t) z.low < 0 ) {
5572                    ++z.high;
5573                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5574                }
5575            }
5576        }
5577        else if ( roundingMode != float_round_to_zero ) {
5578            if (   extractFloat128Sign( z )
5579                 ^ ( roundingMode == float_round_up ) ) {
5580                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5581            }
5582        }
5583        z.low &= ~ roundBitsMask;
5584    }
5585    else {
5586        if ( aExp < 0x3FFF ) {
5587            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5588            STATUS(float_exception_flags) |= float_flag_inexact;
5589            aSign = extractFloat128Sign( a );
5590            switch ( STATUS(float_rounding_mode) ) {
5591             case float_round_nearest_even:
5592                if (    ( aExp == 0x3FFE )
5593                     && (   extractFloat128Frac0( a )
5594                          | extractFloat128Frac1( a ) )
5595                   ) {
5596                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5597                }
5598                break;
5599             case float_round_down:
5600                return
5601                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5602                    : packFloat128( 0, 0, 0, 0 );
5603             case float_round_up:
5604                return
5605                      aSign ? packFloat128( 1, 0, 0, 0 )
5606                    : packFloat128( 0, 0x3FFF, 0, 0 );
5607            }
5608            return packFloat128( aSign, 0, 0, 0 );
5609        }
5610        lastBitMask = 1;
5611        lastBitMask <<= 0x402F - aExp;
5612        roundBitsMask = lastBitMask - 1;
5613        z.low = 0;
5614        z.high = a.high;
5615        roundingMode = STATUS(float_rounding_mode);
5616        if ( roundingMode == float_round_nearest_even ) {
5617            z.high += lastBitMask>>1;
5618            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5619                z.high &= ~ lastBitMask;
5620            }
5621        }
5622        else if ( roundingMode != float_round_to_zero ) {
5623            if (   extractFloat128Sign( z )
5624                 ^ ( roundingMode == float_round_up ) ) {
5625                z.high |= ( a.low != 0 );
5626                z.high += roundBitsMask;
5627            }
5628        }
5629        z.high &= ~ roundBitsMask;
5630    }
5631    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5632        STATUS(float_exception_flags) |= float_flag_inexact;
5633    }
5634    return z;
5635
5636}
5637
5638/*----------------------------------------------------------------------------
5639| Returns the result of adding the absolute values of the quadruple-precision
5640| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5641| before being returned.  `zSign' is ignored if the result is a NaN.
5642| The addition is performed according to the IEC/IEEE Standard for Binary
5643| Floating-Point Arithmetic.
5644*----------------------------------------------------------------------------*/
5645
5646static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5647{
5648    int32 aExp, bExp, zExp;
5649    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5650    int32 expDiff;
5651
5652    aSig1 = extractFloat128Frac1( a );
5653    aSig0 = extractFloat128Frac0( a );
5654    aExp = extractFloat128Exp( a );
5655    bSig1 = extractFloat128Frac1( b );
5656    bSig0 = extractFloat128Frac0( b );
5657    bExp = extractFloat128Exp( b );
5658    expDiff = aExp - bExp;
5659    if ( 0 < expDiff ) {
5660        if ( aExp == 0x7FFF ) {
5661            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5662            return a;
5663        }
5664        if ( bExp == 0 ) {
5665            --expDiff;
5666        }
5667        else {
5668            bSig0 |= LIT64( 0x0001000000000000 );
5669        }
5670        shift128ExtraRightJamming(
5671            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5672        zExp = aExp;
5673    }
5674    else if ( expDiff < 0 ) {
5675        if ( bExp == 0x7FFF ) {
5676            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5677            return packFloat128( zSign, 0x7FFF, 0, 0 );
5678        }
5679        if ( aExp == 0 ) {
5680            ++expDiff;
5681        }
5682        else {
5683            aSig0 |= LIT64( 0x0001000000000000 );
5684        }
5685        shift128ExtraRightJamming(
5686            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5687        zExp = bExp;
5688    }
5689    else {
5690        if ( aExp == 0x7FFF ) {
5691            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5692                return propagateFloat128NaN( a, b STATUS_VAR );
5693            }
5694            return a;
5695        }
5696        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5697        if ( aExp == 0 ) {
5698            if (STATUS(flush_to_zero)) {
5699                if (zSig0 | zSig1) {
5700                    float_raise(float_flag_output_denormal STATUS_VAR);
5701                }
5702                return packFloat128(zSign, 0, 0, 0);
5703            }
5704            return packFloat128( zSign, 0, zSig0, zSig1 );
5705        }
5706        zSig2 = 0;
5707        zSig0 |= LIT64( 0x0002000000000000 );
5708        zExp = aExp;
5709        goto shiftRight1;
5710    }
5711    aSig0 |= LIT64( 0x0001000000000000 );
5712    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5713    --zExp;
5714    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5715    ++zExp;
5716 shiftRight1:
5717    shift128ExtraRightJamming(
5718        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5719 roundAndPack:
5720    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5721
5722}
5723
5724/*----------------------------------------------------------------------------
5725| Returns the result of subtracting the absolute values of the quadruple-
5726| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5727| difference is negated before being returned.  `zSign' is ignored if the
5728| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5729| Standard for Binary Floating-Point Arithmetic.
5730*----------------------------------------------------------------------------*/
5731
5732static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5733{
5734    int32 aExp, bExp, zExp;
5735    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5736    int32 expDiff;
5737    float128 z;
5738
5739    aSig1 = extractFloat128Frac1( a );
5740    aSig0 = extractFloat128Frac0( a );
5741    aExp = extractFloat128Exp( a );
5742    bSig1 = extractFloat128Frac1( b );
5743    bSig0 = extractFloat128Frac0( b );
5744    bExp = extractFloat128Exp( b );
5745    expDiff = aExp - bExp;
5746    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5747    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5748    if ( 0 < expDiff ) goto aExpBigger;
5749    if ( expDiff < 0 ) goto bExpBigger;
5750    if ( aExp == 0x7FFF ) {
5751        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5752            return propagateFloat128NaN( a, b STATUS_VAR );
5753        }
5754        float_raise( float_flag_invalid STATUS_VAR);
5755        z.low = float128_default_nan_low;
5756        z.high = float128_default_nan_high;
5757        return z;
5758    }
5759    if ( aExp == 0 ) {
5760        aExp = 1;
5761        bExp = 1;
5762    }
5763    if ( bSig0 < aSig0 ) goto aBigger;
5764    if ( aSig0 < bSig0 ) goto bBigger;
5765    if ( bSig1 < aSig1 ) goto aBigger;
5766    if ( aSig1 < bSig1 ) goto bBigger;
5767    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5768 bExpBigger:
5769    if ( bExp == 0x7FFF ) {
5770        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5771        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5772    }
5773    if ( aExp == 0 ) {
5774        ++expDiff;
5775    }
5776    else {
5777        aSig0 |= LIT64( 0x4000000000000000 );
5778    }
5779    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5780    bSig0 |= LIT64( 0x4000000000000000 );
5781 bBigger:
5782    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5783    zExp = bExp;
5784    zSign ^= 1;
5785    goto normalizeRoundAndPack;
5786 aExpBigger:
5787    if ( aExp == 0x7FFF ) {
5788        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5789        return a;
5790    }
5791    if ( bExp == 0 ) {
5792        --expDiff;
5793    }
5794    else {
5795        bSig0 |= LIT64( 0x4000000000000000 );
5796    }
5797    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5798    aSig0 |= LIT64( 0x4000000000000000 );
5799 aBigger:
5800    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5801    zExp = aExp;
5802 normalizeRoundAndPack:
5803    --zExp;
5804    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5805
5806}
5807
5808/*----------------------------------------------------------------------------
5809| Returns the result of adding the quadruple-precision floating-point values
5810| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5811| for Binary Floating-Point Arithmetic.
5812*----------------------------------------------------------------------------*/
5813
5814float128 float128_add( float128 a, float128 b STATUS_PARAM )
5815{
5816    flag aSign, bSign;
5817
5818    aSign = extractFloat128Sign( a );
5819    bSign = extractFloat128Sign( b );
5820    if ( aSign == bSign ) {
5821        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5822    }
5823    else {
5824        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5825    }
5826
5827}
5828
5829/*----------------------------------------------------------------------------
5830| Returns the result of subtracting the quadruple-precision floating-point
5831| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5832| Standard for Binary Floating-Point Arithmetic.
5833*----------------------------------------------------------------------------*/
5834
5835float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5836{
5837    flag aSign, bSign;
5838
5839    aSign = extractFloat128Sign( a );
5840    bSign = extractFloat128Sign( b );
5841    if ( aSign == bSign ) {
5842        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5843    }
5844    else {
5845        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5846    }
5847
5848}
5849
5850/*----------------------------------------------------------------------------
5851| Returns the result of multiplying the quadruple-precision floating-point
5852| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5853| Standard for Binary Floating-Point Arithmetic.
5854*----------------------------------------------------------------------------*/
5855
5856float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5857{
5858    flag aSign, bSign, zSign;
5859    int32 aExp, bExp, zExp;
5860    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5861    float128 z;
5862
5863    aSig1 = extractFloat128Frac1( a );
5864    aSig0 = extractFloat128Frac0( a );
5865    aExp = extractFloat128Exp( a );
5866    aSign = extractFloat128Sign( a );
5867    bSig1 = extractFloat128Frac1( b );
5868    bSig0 = extractFloat128Frac0( b );
5869    bExp = extractFloat128Exp( b );
5870    bSign = extractFloat128Sign( b );
5871    zSign = aSign ^ bSign;
5872    if ( aExp == 0x7FFF ) {
5873        if (    ( aSig0 | aSig1 )
5874             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5875            return propagateFloat128NaN( a, b STATUS_VAR );
5876        }
5877        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5878        return packFloat128( zSign, 0x7FFF, 0, 0 );
5879    }
5880    if ( bExp == 0x7FFF ) {
5881        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5882        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5883 invalid:
5884            float_raise( float_flag_invalid STATUS_VAR);
5885            z.low = float128_default_nan_low;
5886            z.high = float128_default_nan_high;
5887            return z;
5888        }
5889        return packFloat128( zSign, 0x7FFF, 0, 0 );
5890    }
5891    if ( aExp == 0 ) {
5892        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5893        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5894    }
5895    if ( bExp == 0 ) {
5896        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5897        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5898    }
5899    zExp = aExp + bExp - 0x4000;
5900    aSig0 |= LIT64( 0x0001000000000000 );
5901    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5902    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5903    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5904    zSig2 |= ( zSig3 != 0 );
5905    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5906        shift128ExtraRightJamming(
5907            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5908        ++zExp;
5909    }
5910    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5911
5912}
5913
5914/*----------------------------------------------------------------------------
5915| Returns the result of dividing the quadruple-precision floating-point value
5916| `a' by the corresponding value `b'.  The operation is performed according to
5917| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5918*----------------------------------------------------------------------------*/
5919
5920float128 float128_div( float128 a, float128 b STATUS_PARAM )
5921{
5922    flag aSign, bSign, zSign;
5923    int32 aExp, bExp, zExp;
5924    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5925    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5926    float128 z;
5927
5928    aSig1 = extractFloat128Frac1( a );
5929    aSig0 = extractFloat128Frac0( a );
5930    aExp = extractFloat128Exp( a );
5931    aSign = extractFloat128Sign( a );
5932    bSig1 = extractFloat128Frac1( b );
5933    bSig0 = extractFloat128Frac0( b );
5934    bExp = extractFloat128Exp( b );
5935    bSign = extractFloat128Sign( b );
5936    zSign = aSign ^ bSign;
5937    if ( aExp == 0x7FFF ) {
5938        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5939        if ( bExp == 0x7FFF ) {
5940            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5941            goto invalid;
5942        }
5943        return packFloat128( zSign, 0x7FFF, 0, 0 );
5944    }
5945    if ( bExp == 0x7FFF ) {
5946        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5947        return packFloat128( zSign, 0, 0, 0 );
5948    }
5949    if ( bExp == 0 ) {
5950        if ( ( bSig0 | bSig1 ) == 0 ) {
5951            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5952 invalid:
5953                float_raise( float_flag_invalid STATUS_VAR);
5954                z.low = float128_default_nan_low;
5955                z.high = float128_default_nan_high;
5956                return z;
5957            }
5958            float_raise( float_flag_divbyzero STATUS_VAR);
5959            return packFloat128( zSign, 0x7FFF, 0, 0 );
5960        }
5961        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5962    }
5963    if ( aExp == 0 ) {
5964        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5965        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5966    }
5967    zExp = aExp - bExp + 0x3FFD;
5968    shortShift128Left(
5969        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5970    shortShift128Left(
5971        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5972    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5973        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5974        ++zExp;
5975    }
5976    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5977    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5978    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5979    while ( (int64_t) rem0 < 0 ) {
5980        --zSig0;
5981        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5982    }
5983    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5984    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5985        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5986        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5987        while ( (int64_t) rem1 < 0 ) {
5988            --zSig1;
5989            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5990        }
5991        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5992    }
5993    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5994    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5995
5996}
5997
5998/*----------------------------------------------------------------------------
5999| Returns the remainder of the quadruple-precision floating-point value `a'
6000| with respect to the corresponding value `b'.  The operation is performed
6001| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002*----------------------------------------------------------------------------*/
6003
6004float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6005{
6006    flag aSign, zSign;
6007    int32 aExp, bExp, expDiff;
6008    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6009    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6010    int64_t sigMean0;
6011    float128 z;
6012
6013    aSig1 = extractFloat128Frac1( a );
6014    aSig0 = extractFloat128Frac0( a );
6015    aExp = extractFloat128Exp( a );
6016    aSign = extractFloat128Sign( a );
6017    bSig1 = extractFloat128Frac1( b );
6018    bSig0 = extractFloat128Frac0( b );
6019    bExp = extractFloat128Exp( b );
6020    if ( aExp == 0x7FFF ) {
6021        if (    ( aSig0 | aSig1 )
6022             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6023            return propagateFloat128NaN( a, b STATUS_VAR );
6024        }
6025        goto invalid;
6026    }
6027    if ( bExp == 0x7FFF ) {
6028        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6029        return a;
6030    }
6031    if ( bExp == 0 ) {
6032        if ( ( bSig0 | bSig1 ) == 0 ) {
6033 invalid:
6034            float_raise( float_flag_invalid STATUS_VAR);
6035            z.low = float128_default_nan_low;
6036            z.high = float128_default_nan_high;
6037            return z;
6038        }
6039        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6040    }
6041    if ( aExp == 0 ) {
6042        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6043        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6044    }
6045    expDiff = aExp - bExp;
6046    if ( expDiff < -1 ) return a;
6047    shortShift128Left(
6048        aSig0 | LIT64( 0x0001000000000000 ),
6049        aSig1,
6050        15 - ( expDiff < 0 ),
6051        &aSig0,
6052        &aSig1
6053    );
6054    shortShift128Left(
6055        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6056    q = le128( bSig0, bSig1, aSig0, aSig1 );
6057    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6058    expDiff -= 64;
6059    while ( 0 < expDiff ) {
6060        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6061        q = ( 4 < q ) ? q - 4 : 0;
6062        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6063        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6064        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6065        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6066        expDiff -= 61;
6067    }
6068    if ( -64 < expDiff ) {
6069        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6070        q = ( 4 < q ) ? q - 4 : 0;
6071        q >>= - expDiff;
6072        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6073        expDiff += 52;
6074        if ( expDiff < 0 ) {
6075            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6076        }
6077        else {
6078            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6079        }
6080        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6081        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6082    }
6083    else {
6084        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6085        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6086    }
6087    do {
6088        alternateASig0 = aSig0;
6089        alternateASig1 = aSig1;
6090        ++q;
6091        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6092    } while ( 0 <= (int64_t) aSig0 );
6093    add128(
6094        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6095    if (    ( sigMean0 < 0 )
6096         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6097        aSig0 = alternateASig0;
6098        aSig1 = alternateASig1;
6099    }
6100    zSign = ( (int64_t) aSig0 < 0 );
6101    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6102    return
6103        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6104
6105}
6106
6107/*----------------------------------------------------------------------------
6108| Returns the square root of the quadruple-precision floating-point value `a'.
6109| The operation is performed according to the IEC/IEEE Standard for Binary
6110| Floating-Point Arithmetic.
6111*----------------------------------------------------------------------------*/
6112
6113float128 float128_sqrt( float128 a STATUS_PARAM )
6114{
6115    flag aSign;
6116    int32 aExp, zExp;
6117    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6118    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6119    float128 z;
6120
6121    aSig1 = extractFloat128Frac1( a );
6122    aSig0 = extractFloat128Frac0( a );
6123    aExp = extractFloat128Exp( a );
6124    aSign = extractFloat128Sign( a );
6125    if ( aExp == 0x7FFF ) {
6126        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6127        if ( ! aSign ) return a;
6128        goto invalid;
6129    }
6130    if ( aSign ) {
6131        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6132 invalid:
6133        float_raise( float_flag_invalid STATUS_VAR);
6134        z.low = float128_default_nan_low;
6135        z.high = float128_default_nan_high;
6136        return z;
6137    }
6138    if ( aExp == 0 ) {
6139        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6140        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6141    }
6142    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6143    aSig0 |= LIT64( 0x0001000000000000 );
6144    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6145    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6146    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6147    doubleZSig0 = zSig0<<1;
6148    mul64To128( zSig0, zSig0, &term0, &term1 );
6149    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6150    while ( (int64_t) rem0 < 0 ) {
6151        --zSig0;
6152        doubleZSig0 -= 2;
6153        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6154    }
6155    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6156    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6157        if ( zSig1 == 0 ) zSig1 = 1;
6158        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6159        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6160        mul64To128( zSig1, zSig1, &term2, &term3 );
6161        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6162        while ( (int64_t) rem1 < 0 ) {
6163            --zSig1;
6164            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6165            term3 |= 1;
6166            term2 |= doubleZSig0;
6167            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6168        }
6169        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6170    }
6171    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6172    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6173
6174}
6175
6176/*----------------------------------------------------------------------------
6177| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6178| the corresponding value `b', and 0 otherwise.  The invalid exception is
6179| raised if either operand is a NaN.  Otherwise, the comparison is performed
6180| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6181*----------------------------------------------------------------------------*/
6182
6183int float128_eq( float128 a, float128 b STATUS_PARAM )
6184{
6185
6186    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6187              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6188         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6189              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6190       ) {
6191        float_raise( float_flag_invalid STATUS_VAR);
6192        return 0;
6193    }
6194    return
6195           ( a.low == b.low )
6196        && (    ( a.high == b.high )
6197             || (    ( a.low == 0 )
6198                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6199           );
6200
6201}
6202
6203/*----------------------------------------------------------------------------
6204| Returns 1 if the quadruple-precision floating-point value `a' is less than
6205| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6206| exception is raised if either operand is a NaN.  The comparison is performed
6207| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6208*----------------------------------------------------------------------------*/
6209
6210int float128_le( float128 a, float128 b STATUS_PARAM )
6211{
6212    flag aSign, bSign;
6213
6214    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6215              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6216         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6217              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6218       ) {
6219        float_raise( float_flag_invalid STATUS_VAR);
6220        return 0;
6221    }
6222    aSign = extractFloat128Sign( a );
6223    bSign = extractFloat128Sign( b );
6224    if ( aSign != bSign ) {
6225        return
6226               aSign
6227            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6228                 == 0 );
6229    }
6230    return
6231          aSign ? le128( b.high, b.low, a.high, a.low )
6232        : le128( a.high, a.low, b.high, b.low );
6233
6234}
6235
6236/*----------------------------------------------------------------------------
6237| Returns 1 if the quadruple-precision floating-point value `a' is less than
6238| the corresponding value `b', and 0 otherwise.  The invalid exception is
6239| raised if either operand is a NaN.  The comparison is performed according
6240| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6241*----------------------------------------------------------------------------*/
6242
6243int float128_lt( float128 a, float128 b STATUS_PARAM )
6244{
6245    flag aSign, bSign;
6246
6247    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6248              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6249         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6250              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6251       ) {
6252        float_raise( float_flag_invalid STATUS_VAR);
6253        return 0;
6254    }
6255    aSign = extractFloat128Sign( a );
6256    bSign = extractFloat128Sign( b );
6257    if ( aSign != bSign ) {
6258        return
6259               aSign
6260            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6261                 != 0 );
6262    }
6263    return
6264          aSign ? lt128( b.high, b.low, a.high, a.low )
6265        : lt128( a.high, a.low, b.high, b.low );
6266
6267}
6268
6269/*----------------------------------------------------------------------------
6270| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6271| be compared, and 0 otherwise.  The invalid exception is raised if either
6272| operand is a NaN. The comparison is performed according to the IEC/IEEE
6273| Standard for Binary Floating-Point Arithmetic.
6274*----------------------------------------------------------------------------*/
6275
6276int float128_unordered( float128 a, float128 b STATUS_PARAM )
6277{
6278    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6279              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6280         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6281              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6282       ) {
6283        float_raise( float_flag_invalid STATUS_VAR);
6284        return 1;
6285    }
6286    return 0;
6287}
6288
6289/*----------------------------------------------------------------------------
6290| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6291| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6292| exception.  The comparison is performed according to the IEC/IEEE Standard
6293| for Binary Floating-Point Arithmetic.
6294*----------------------------------------------------------------------------*/
6295
6296int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6297{
6298
6299    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6300              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6301         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6302              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6303       ) {
6304        if (    float128_is_signaling_nan( a )
6305             || float128_is_signaling_nan( b ) ) {
6306            float_raise( float_flag_invalid STATUS_VAR);
6307        }
6308        return 0;
6309    }
6310    return
6311           ( a.low == b.low )
6312        && (    ( a.high == b.high )
6313             || (    ( a.low == 0 )
6314                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6315           );
6316
6317}
6318
6319/*----------------------------------------------------------------------------
6320| Returns 1 if the quadruple-precision floating-point value `a' is less than
6321| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6322| cause an exception.  Otherwise, the comparison is performed according to the
6323| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6324*----------------------------------------------------------------------------*/
6325
6326int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6327{
6328    flag aSign, bSign;
6329
6330    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6331              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6332         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6333              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6334       ) {
6335        if (    float128_is_signaling_nan( a )
6336             || float128_is_signaling_nan( b ) ) {
6337            float_raise( float_flag_invalid STATUS_VAR);
6338        }
6339        return 0;
6340    }
6341    aSign = extractFloat128Sign( a );
6342    bSign = extractFloat128Sign( b );
6343    if ( aSign != bSign ) {
6344        return
6345               aSign
6346            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6347                 == 0 );
6348    }
6349    return
6350          aSign ? le128( b.high, b.low, a.high, a.low )
6351        : le128( a.high, a.low, b.high, b.low );
6352
6353}
6354
6355/*----------------------------------------------------------------------------
6356| Returns 1 if the quadruple-precision floating-point value `a' is less than
6357| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6358| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6359| Standard for Binary Floating-Point Arithmetic.
6360*----------------------------------------------------------------------------*/
6361
6362int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6363{
6364    flag aSign, bSign;
6365
6366    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6367              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6368         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6369              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6370       ) {
6371        if (    float128_is_signaling_nan( a )
6372             || float128_is_signaling_nan( b ) ) {
6373            float_raise( float_flag_invalid STATUS_VAR);
6374        }
6375        return 0;
6376    }
6377    aSign = extractFloat128Sign( a );
6378    bSign = extractFloat128Sign( b );
6379    if ( aSign != bSign ) {
6380        return
6381               aSign
6382            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6383                 != 0 );
6384    }
6385    return
6386          aSign ? lt128( b.high, b.low, a.high, a.low )
6387        : lt128( a.high, a.low, b.high, b.low );
6388
6389}
6390
6391/*----------------------------------------------------------------------------
6392| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6393| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6394| comparison is performed according to the IEC/IEEE Standard for Binary
6395| Floating-Point Arithmetic.
6396*----------------------------------------------------------------------------*/
6397
6398int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6399{
6400    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6401              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6402         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6403              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6404       ) {
6405        if (    float128_is_signaling_nan( a )
6406             || float128_is_signaling_nan( b ) ) {
6407            float_raise( float_flag_invalid STATUS_VAR);
6408        }
6409        return 1;
6410    }
6411    return 0;
6412}
6413
6414/* misc functions */
6415float32 uint32_to_float32( uint32 a STATUS_PARAM )
6416{
6417    return int64_to_float32(a STATUS_VAR);
6418}
6419
6420float64 uint32_to_float64( uint32 a STATUS_PARAM )
6421{
6422    return int64_to_float64(a STATUS_VAR);
6423}
6424
6425uint32 float32_to_uint32( float32 a STATUS_PARAM )
6426{
6427    int64_t v;
6428    uint32 res;
6429
6430    v = float32_to_int64(a STATUS_VAR);
6431    if (v < 0) {
6432        res = 0;
6433        float_raise( float_flag_invalid STATUS_VAR);
6434    } else if (v > 0xffffffff) {
6435        res = 0xffffffff;
6436        float_raise( float_flag_invalid STATUS_VAR);
6437    } else {
6438        res = v;
6439    }
6440    return res;
6441}
6442
6443uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6444{
6445    int64_t v;
6446    uint32 res;
6447
6448    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6449    if (v < 0) {
6450        res = 0;
6451        float_raise( float_flag_invalid STATUS_VAR);
6452    } else if (v > 0xffffffff) {
6453        res = 0xffffffff;
6454        float_raise( float_flag_invalid STATUS_VAR);
6455    } else {
6456        res = v;
6457    }
6458    return res;
6459}
6460
6461uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6462{
6463    int64_t v;
6464    uint_fast16_t res;
6465
6466    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6467    if (v < 0) {
6468        res = 0;
6469        float_raise( float_flag_invalid STATUS_VAR);
6470    } else if (v > 0xffff) {
6471        res = 0xffff;
6472        float_raise( float_flag_invalid STATUS_VAR);
6473    } else {
6474        res = v;
6475    }
6476    return res;
6477}
6478
6479uint32 float64_to_uint32( float64 a STATUS_PARAM )
6480{
6481    int64_t v;
6482    uint32 res;
6483
6484    v = float64_to_int64(a STATUS_VAR);
6485    if (v < 0) {
6486        res = 0;
6487        float_raise( float_flag_invalid STATUS_VAR);
6488    } else if (v > 0xffffffff) {
6489        res = 0xffffffff;
6490        float_raise( float_flag_invalid STATUS_VAR);
6491    } else {
6492        res = v;
6493    }
6494    return res;
6495}
6496
6497uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6498{
6499    int64_t v;
6500    uint32 res;
6501
6502    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6503    if (v < 0) {
6504        res = 0;
6505        float_raise( float_flag_invalid STATUS_VAR);
6506    } else if (v > 0xffffffff) {
6507        res = 0xffffffff;
6508        float_raise( float_flag_invalid STATUS_VAR);
6509    } else {
6510        res = v;
6511    }
6512    return res;
6513}
6514
6515uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6516{
6517    int64_t v;
6518    uint_fast16_t res;
6519
6520    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6521    if (v < 0) {
6522        res = 0;
6523        float_raise( float_flag_invalid STATUS_VAR);
6524    } else if (v > 0xffff) {
6525        res = 0xffff;
6526        float_raise( float_flag_invalid STATUS_VAR);
6527    } else {
6528        res = v;
6529    }
6530    return res;
6531}
6532
6533/* FIXME: This looks broken.  */
6534uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6535{
6536    int64_t v;
6537
6538    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6539    v += float64_val(a);
6540    v = float64_to_int64(make_float64(v) STATUS_VAR);
6541
6542    return v - INT64_MIN;
6543}
6544
6545uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6546{
6547    int64_t v;
6548
6549    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6550    v += float64_val(a);
6551    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6552
6553    return v - INT64_MIN;
6554}
6555
6556#define COMPARE(s, nan_exp)                                                  \
6557INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6558                                      int is_quiet STATUS_PARAM )            \
6559{                                                                            \
6560    flag aSign, bSign;                                                       \
6561    uint ## s ## _t av, bv;                                                  \
6562    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6563    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6564                                                                             \
6565    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6566         extractFloat ## s ## Frac( a ) ) ||                                 \
6567        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6568          extractFloat ## s ## Frac( b ) )) {                                \
6569        if (!is_quiet ||                                                     \
6570            float ## s ## _is_signaling_nan( a ) ||                          \
6571            float ## s ## _is_signaling_nan( b ) ) {                         \
6572            float_raise( float_flag_invalid STATUS_VAR);                     \
6573        }                                                                    \
6574        return float_relation_unordered;                                     \
6575    }                                                                        \
6576    aSign = extractFloat ## s ## Sign( a );                                  \
6577    bSign = extractFloat ## s ## Sign( b );                                  \
6578    av = float ## s ## _val(a);                                              \
6579    bv = float ## s ## _val(b);                                              \
6580    if ( aSign != bSign ) {                                                  \
6581        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6582            /* zero case */                                                  \
6583            return float_relation_equal;                                     \
6584        } else {                                                             \
6585            return 1 - (2 * aSign);                                          \
6586        }                                                                    \
6587    } else {                                                                 \
6588        if (av == bv) {                                                      \
6589            return float_relation_equal;                                     \
6590        } else {                                                             \
6591            return 1 - 2 * (aSign ^ ( av < bv ));                            \
6592        }                                                                    \
6593    }                                                                        \
6594}                                                                            \
6595                                                                             \
6596int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6597{                                                                            \
6598    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6599}                                                                            \
6600                                                                             \
6601int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6602{                                                                            \
6603    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6604}
6605
6606COMPARE(32, 0xff)
6607COMPARE(64, 0x7ff)
6608
6609INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6610                                      int is_quiet STATUS_PARAM )
6611{
6612    flag aSign, bSign;
6613
6614    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6615          ( extractFloatx80Frac( a )<<1 ) ) ||
6616        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6617          ( extractFloatx80Frac( b )<<1 ) )) {
6618        if (!is_quiet ||
6619            floatx80_is_signaling_nan( a ) ||
6620            floatx80_is_signaling_nan( b ) ) {
6621            float_raise( float_flag_invalid STATUS_VAR);
6622        }
6623        return float_relation_unordered;
6624    }
6625    aSign = extractFloatx80Sign( a );
6626    bSign = extractFloatx80Sign( b );
6627    if ( aSign != bSign ) {
6628
6629        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6630             ( ( a.low | b.low ) == 0 ) ) {
6631            /* zero case */
6632            return float_relation_equal;
6633        } else {
6634            return 1 - (2 * aSign);
6635        }
6636    } else {
6637        if (a.low == b.low && a.high == b.high) {
6638            return float_relation_equal;
6639        } else {
6640            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6641        }
6642    }
6643}
6644
6645int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6646{
6647    return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6648}
6649
6650int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6651{
6652    return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6653}
6654
6655INLINE int float128_compare_internal( float128 a, float128 b,
6656                                      int is_quiet STATUS_PARAM )
6657{
6658    flag aSign, bSign;
6659
6660    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6661          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6662        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6663          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6664        if (!is_quiet ||
6665            float128_is_signaling_nan( a ) ||
6666            float128_is_signaling_nan( b ) ) {
6667            float_raise( float_flag_invalid STATUS_VAR);
6668        }
6669        return float_relation_unordered;
6670    }
6671    aSign = extractFloat128Sign( a );
6672    bSign = extractFloat128Sign( b );
6673    if ( aSign != bSign ) {
6674        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6675            /* zero case */
6676            return float_relation_equal;
6677        } else {
6678            return 1 - (2 * aSign);
6679        }
6680    } else {
6681        if (a.low == b.low && a.high == b.high) {
6682            return float_relation_equal;
6683        } else {
6684            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6685        }
6686    }
6687}
6688
6689int float128_compare( float128 a, float128 b STATUS_PARAM )
6690{
6691    return float128_compare_internal(a, b, 0 STATUS_VAR);
6692}
6693
6694int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6695{
6696    return float128_compare_internal(a, b, 1 STATUS_VAR);
6697}
6698
6699/* min() and max() functions. These can't be implemented as
6700 * 'compare and pick one input' because that would mishandle
6701 * NaNs and +0 vs -0.
6702 */
6703#define MINMAX(s, nan_exp)                                              \
6704INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6705                                        int ismin STATUS_PARAM )        \
6706{                                                                       \
6707    flag aSign, bSign;                                                  \
6708    uint ## s ## _t av, bv;                                             \
6709    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6710    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6711    if (float ## s ## _is_any_nan(a) ||                                 \
6712        float ## s ## _is_any_nan(b)) {                                 \
6713        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6714    }                                                                   \
6715    aSign = extractFloat ## s ## Sign(a);                               \
6716    bSign = extractFloat ## s ## Sign(b);                               \
6717    av = float ## s ## _val(a);                                         \
6718    bv = float ## s ## _val(b);                                         \
6719    if (aSign != bSign) {                                               \
6720        if (ismin) {                                                    \
6721            return aSign ? a : b;                                       \
6722        } else {                                                        \
6723            return aSign ? b : a;                                       \
6724        }                                                               \
6725    } else {                                                            \
6726        if (ismin) {                                                    \
6727            return (aSign ^ (av < bv)) ? a : b;                         \
6728        } else {                                                        \
6729            return (aSign ^ (av < bv)) ? b : a;                         \
6730        }                                                               \
6731    }                                                                   \
6732}                                                                       \
6733                                                                        \
6734float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6735{                                                                       \
6736    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6737}                                                                       \
6738                                                                        \
6739float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6740{                                                                       \
6741    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6742}
6743
6744MINMAX(32, 0xff)
6745MINMAX(64, 0x7ff)
6746
6747
6748/* Multiply A by 2 raised to the power N.  */
6749float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6750{
6751    flag aSign;
6752    int16_t aExp;
6753    uint32_t aSig;
6754
6755    a = float32_squash_input_denormal(a STATUS_VAR);
6756    aSig = extractFloat32Frac( a );
6757    aExp = extractFloat32Exp( a );
6758    aSign = extractFloat32Sign( a );
6759
6760    if ( aExp == 0xFF ) {
6761        if ( aSig ) {
6762            return propagateFloat32NaN( a, a STATUS_VAR );
6763        }
6764        return a;
6765    }
6766    if ( aExp != 0 )
6767        aSig |= 0x00800000;
6768    else if ( aSig == 0 )
6769        return a;
6770
6771    if (n > 0x200) {
6772        n = 0x200;
6773    } else if (n < -0x200) {
6774        n = -0x200;
6775    }
6776
6777    aExp += n - 1;
6778    aSig <<= 7;
6779    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6780}
6781
6782float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6783{
6784    flag aSign;
6785    int16_t aExp;
6786    uint64_t aSig;
6787
6788    a = float64_squash_input_denormal(a STATUS_VAR);
6789    aSig = extractFloat64Frac( a );
6790    aExp = extractFloat64Exp( a );
6791    aSign = extractFloat64Sign( a );
6792
6793    if ( aExp == 0x7FF ) {
6794        if ( aSig ) {
6795            return propagateFloat64NaN( a, a STATUS_VAR );
6796        }
6797        return a;
6798    }
6799    if ( aExp != 0 )
6800        aSig |= LIT64( 0x0010000000000000 );
6801    else if ( aSig == 0 )
6802        return a;
6803
6804    if (n > 0x1000) {
6805        n = 0x1000;
6806    } else if (n < -0x1000) {
6807        n = -0x1000;
6808    }
6809
6810    aExp += n - 1;
6811    aSig <<= 10;
6812    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6813}
6814
6815floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6816{
6817    flag aSign;
6818    int32_t aExp;
6819    uint64_t aSig;
6820
6821    aSig = extractFloatx80Frac( a );
6822    aExp = extractFloatx80Exp( a );
6823    aSign = extractFloatx80Sign( a );
6824
6825    if ( aExp == 0x7FFF ) {
6826        if ( aSig<<1 ) {
6827            return propagateFloatx80NaN( a, a STATUS_VAR );
6828        }
6829        return a;
6830    }
6831
6832    if (aExp == 0 && aSig == 0)
6833        return a;
6834
6835    if (n > 0x10000) {
6836        n = 0x10000;
6837    } else if (n < -0x10000) {
6838        n = -0x10000;
6839    }
6840
6841    aExp += n;
6842    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6843                                          aSign, aExp, aSig, 0 STATUS_VAR );
6844}
6845
6846float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6847{
6848    flag aSign;
6849    int32_t aExp;
6850    uint64_t aSig0, aSig1;
6851
6852    aSig1 = extractFloat128Frac1( a );
6853    aSig0 = extractFloat128Frac0( a );
6854    aExp = extractFloat128Exp( a );
6855    aSign = extractFloat128Sign( a );
6856    if ( aExp == 0x7FFF ) {
6857        if ( aSig0 | aSig1 ) {
6858            return propagateFloat128NaN( a, a STATUS_VAR );
6859        }
6860        return a;
6861    }
6862    if ( aExp != 0 )
6863        aSig0 |= LIT64( 0x0001000000000000 );
6864    else if ( aSig0 == 0 && aSig1 == 0 )
6865        return a;
6866
6867    if (n > 0x10000) {
6868        n = 0x10000;
6869    } else if (n < -0x10000) {
6870        n = -0x10000;
6871    }
6872
6873    aExp += n - 1;
6874    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6875                                          STATUS_VAR );
6876
6877}
6878