qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * Derived from SoftFloat.
   5 */
   6
   7/*============================================================================
   8
   9This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10Package, Release 2b.
  11
  12Written by John R. Hauser.  This work was made possible in part by the
  13International Computer Science Institute, located at Suite 600, 1947 Center
  14Street, Berkeley, California 94704.  Funding was partially provided by the
  15National Science Foundation under grant MIP-9311980.  The original version
  16of this code was written as part of a project to build a fixed-point vector
  17processor in collaboration with the University of California at Berkeley,
  18overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20arithmetic/SoftFloat.html'.
  21
  22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31Derivative works are acceptable, even for commercial purposes, so long as
  32(1) the source code for the derivative work includes prominent notice that
  33the work is derivative, and (2) the source code includes prominent notice with
  34these four paragraphs for those parts of this code that are retained.
  35
  36=============================================================================*/
  37
  38/* softfloat (and in particular the code in softfloat-specialize.h) is
  39 * target-dependent and needs the TARGET_* macros.
  40 */
  41#include "config.h"
  42
  43#include "softfloat.h"
  44
  45/*----------------------------------------------------------------------------
  46| Primitive arithmetic functions, including multi-word arithmetic, and
  47| division and square root approximations.  (Can be specialized to target if
  48| desired.)
  49*----------------------------------------------------------------------------*/
  50#include "softfloat-macros.h"
  51
  52/*----------------------------------------------------------------------------
  53| Functions and definitions to determine:  (1) whether tininess for underflow
  54| is detected before or after rounding by default, (2) what (if anything)
  55| happens when exceptions are raised, (3) how signaling NaNs are distinguished
  56| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  57| are propagated from function inputs to output.  These details are target-
  58| specific.
  59*----------------------------------------------------------------------------*/
  60#include "softfloat-specialize.h"
  61
  62void set_float_rounding_mode(int val STATUS_PARAM)
  63{
  64    STATUS(float_rounding_mode) = val;
  65}
  66
  67void set_float_exception_flags(int val STATUS_PARAM)
  68{
  69    STATUS(float_exception_flags) = val;
  70}
  71
  72void set_floatx80_rounding_precision(int val STATUS_PARAM)
  73{
  74    STATUS(floatx80_rounding_precision) = val;
  75}
  76
  77/*----------------------------------------------------------------------------
  78| Returns the fraction bits of the half-precision floating-point value `a'.
  79*----------------------------------------------------------------------------*/
  80
  81INLINE uint32_t extractFloat16Frac(float16 a)
  82{
  83    return float16_val(a) & 0x3ff;
  84}
  85
  86/*----------------------------------------------------------------------------
  87| Returns the exponent bits of the half-precision floating-point value `a'.
  88*----------------------------------------------------------------------------*/
  89
  90INLINE int_fast16_t extractFloat16Exp(float16 a)
  91{
  92    return (float16_val(a) >> 10) & 0x1f;
  93}
  94
  95/*----------------------------------------------------------------------------
  96| Returns the sign bit of the single-precision floating-point value `a'.
  97*----------------------------------------------------------------------------*/
  98
  99INLINE flag extractFloat16Sign(float16 a)
 100{
 101    return float16_val(a)>>15;
 102}
 103
 104/*----------------------------------------------------------------------------
 105| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 106| and 7, and returns the properly rounded 32-bit integer corresponding to the
 107| input.  If `zSign' is 1, the input is negated before being converted to an
 108| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 109| is simply rounded to an integer, with the inexact exception raised if the
 110| input cannot be represented exactly as an integer.  However, if the fixed-
 111| point input is too large, the invalid exception is raised and the largest
 112| positive or negative integer is returned.
 113*----------------------------------------------------------------------------*/
 114
 115static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 116{
 117    int8 roundingMode;
 118    flag roundNearestEven;
 119    int8 roundIncrement, roundBits;
 120    int32_t z;
 121
 122    roundingMode = STATUS(float_rounding_mode);
 123    roundNearestEven = ( roundingMode == float_round_nearest_even );
 124    roundIncrement = 0x40;
 125    if ( ! roundNearestEven ) {
 126        if ( roundingMode == float_round_to_zero ) {
 127            roundIncrement = 0;
 128        }
 129        else {
 130            roundIncrement = 0x7F;
 131            if ( zSign ) {
 132                if ( roundingMode == float_round_up ) roundIncrement = 0;
 133            }
 134            else {
 135                if ( roundingMode == float_round_down ) roundIncrement = 0;
 136            }
 137        }
 138    }
 139    roundBits = absZ & 0x7F;
 140    absZ = ( absZ + roundIncrement )>>7;
 141    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 142    z = absZ;
 143    if ( zSign ) z = - z;
 144    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 145        float_raise( float_flag_invalid STATUS_VAR);
 146        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 147    }
 148    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 149    return z;
 150
 151}
 152
 153/*----------------------------------------------------------------------------
 154| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155| `absZ1', with binary point between bits 63 and 64 (between the input words),
 156| and returns the properly rounded 64-bit integer corresponding to the input.
 157| If `zSign' is 1, the input is negated before being converted to an integer.
 158| Ordinarily, the fixed-point input is simply rounded to an integer, with
 159| the inexact exception raised if the input cannot be represented exactly as
 160| an integer.  However, if the fixed-point input is too large, the invalid
 161| exception is raised and the largest positive or negative integer is
 162| returned.
 163*----------------------------------------------------------------------------*/
 164
 165static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 166{
 167    int8 roundingMode;
 168    flag roundNearestEven, increment;
 169    int64_t z;
 170
 171    roundingMode = STATUS(float_rounding_mode);
 172    roundNearestEven = ( roundingMode == float_round_nearest_even );
 173    increment = ( (int64_t) absZ1 < 0 );
 174    if ( ! roundNearestEven ) {
 175        if ( roundingMode == float_round_to_zero ) {
 176            increment = 0;
 177        }
 178        else {
 179            if ( zSign ) {
 180                increment = ( roundingMode == float_round_down ) && absZ1;
 181            }
 182            else {
 183                increment = ( roundingMode == float_round_up ) && absZ1;
 184            }
 185        }
 186    }
 187    if ( increment ) {
 188        ++absZ0;
 189        if ( absZ0 == 0 ) goto overflow;
 190        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191    }
 192    z = absZ0;
 193    if ( zSign ) z = - z;
 194    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195 overflow:
 196        float_raise( float_flag_invalid STATUS_VAR);
 197        return
 198              zSign ? (int64_t) LIT64( 0x8000000000000000 )
 199            : LIT64( 0x7FFFFFFFFFFFFFFF );
 200    }
 201    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 202    return z;
 203
 204}
 205
 206/*----------------------------------------------------------------------------
 207| Returns the fraction bits of the single-precision floating-point value `a'.
 208*----------------------------------------------------------------------------*/
 209
 210INLINE uint32_t extractFloat32Frac( float32 a )
 211{
 212
 213    return float32_val(a) & 0x007FFFFF;
 214
 215}
 216
 217/*----------------------------------------------------------------------------
 218| Returns the exponent bits of the single-precision floating-point value `a'.
 219*----------------------------------------------------------------------------*/
 220
 221INLINE int_fast16_t extractFloat32Exp(float32 a)
 222{
 223
 224    return ( float32_val(a)>>23 ) & 0xFF;
 225
 226}
 227
 228/*----------------------------------------------------------------------------
 229| Returns the sign bit of the single-precision floating-point value `a'.
 230*----------------------------------------------------------------------------*/
 231
 232INLINE flag extractFloat32Sign( float32 a )
 233{
 234
 235    return float32_val(a)>>31;
 236
 237}
 238
 239/*----------------------------------------------------------------------------
 240| If `a' is denormal and we are in flush-to-zero mode then set the
 241| input-denormal exception and return zero. Otherwise just return the value.
 242*----------------------------------------------------------------------------*/
 243static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 244{
 245    if (STATUS(flush_inputs_to_zero)) {
 246        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 247            float_raise(float_flag_input_denormal STATUS_VAR);
 248            return make_float32(float32_val(a) & 0x80000000);
 249        }
 250    }
 251    return a;
 252}
 253
 254/*----------------------------------------------------------------------------
 255| Normalizes the subnormal single-precision floating-point value represented
 256| by the denormalized significand `aSig'.  The normalized exponent and
 257| significand are stored at the locations pointed to by `zExpPtr' and
 258| `zSigPtr', respectively.
 259*----------------------------------------------------------------------------*/
 260
 261static void
 262 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 263{
 264    int8 shiftCount;
 265
 266    shiftCount = countLeadingZeros32( aSig ) - 8;
 267    *zSigPtr = aSig<<shiftCount;
 268    *zExpPtr = 1 - shiftCount;
 269
 270}
 271
 272/*----------------------------------------------------------------------------
 273| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 274| single-precision floating-point value, returning the result.  After being
 275| shifted into the proper positions, the three fields are simply added
 276| together to form the result.  This means that any integer portion of `zSig'
 277| will be added into the exponent.  Since a properly normalized significand
 278| will have an integer portion equal to 1, the `zExp' input should be 1 less
 279| than the desired result exponent whenever `zSig' is a complete, normalized
 280| significand.
 281*----------------------------------------------------------------------------*/
 282
 283INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 284{
 285
 286    return make_float32(
 287          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 288
 289}
 290
 291/*----------------------------------------------------------------------------
 292| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 293| and significand `zSig', and returns the proper single-precision floating-
 294| point value corresponding to the abstract input.  Ordinarily, the abstract
 295| value is simply rounded and packed into the single-precision format, with
 296| the inexact exception raised if the abstract input cannot be represented
 297| exactly.  However, if the abstract value is too large, the overflow and
 298| inexact exceptions are raised and an infinity or maximal finite value is
 299| returned.  If the abstract value is too small, the input value is rounded to
 300| a subnormal number, and the underflow and inexact exceptions are raised if
 301| the abstract input cannot be represented exactly as a subnormal single-
 302| precision floating-point number.
 303|     The input significand `zSig' has its binary point between bits 30
 304| and 29, which is 7 bits to the left of the usual location.  This shifted
 305| significand must be normalized or smaller.  If `zSig' is not normalized,
 306| `zExp' must be 0; in that case, the result returned is a subnormal number,
 307| and it must not require rounding.  In the usual case that `zSig' is
 308| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 309| The handling of underflow and overflow follows the IEC/IEEE Standard for
 310| Binary Floating-Point Arithmetic.
 311*----------------------------------------------------------------------------*/
 312
 313static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 314{
 315    int8 roundingMode;
 316    flag roundNearestEven;
 317    int8 roundIncrement, roundBits;
 318    flag isTiny;
 319
 320    roundingMode = STATUS(float_rounding_mode);
 321    roundNearestEven = ( roundingMode == float_round_nearest_even );
 322    roundIncrement = 0x40;
 323    if ( ! roundNearestEven ) {
 324        if ( roundingMode == float_round_to_zero ) {
 325            roundIncrement = 0;
 326        }
 327        else {
 328            roundIncrement = 0x7F;
 329            if ( zSign ) {
 330                if ( roundingMode == float_round_up ) roundIncrement = 0;
 331            }
 332            else {
 333                if ( roundingMode == float_round_down ) roundIncrement = 0;
 334            }
 335        }
 336    }
 337    roundBits = zSig & 0x7F;
 338    if ( 0xFD <= (uint16_t) zExp ) {
 339        if (    ( 0xFD < zExp )
 340             || (    ( zExp == 0xFD )
 341                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 342           ) {
 343            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 344            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 345        }
 346        if ( zExp < 0 ) {
 347            if (STATUS(flush_to_zero)) {
 348                float_raise(float_flag_output_denormal STATUS_VAR);
 349                return packFloat32(zSign, 0, 0);
 350            }
 351            isTiny =
 352                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 353                || ( zExp < -1 )
 354                || ( zSig + roundIncrement < 0x80000000 );
 355            shift32RightJamming( zSig, - zExp, &zSig );
 356            zExp = 0;
 357            roundBits = zSig & 0x7F;
 358            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 359        }
 360    }
 361    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 362    zSig = ( zSig + roundIncrement )>>7;
 363    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 364    if ( zSig == 0 ) zExp = 0;
 365    return packFloat32( zSign, zExp, zSig );
 366
 367}
 368
 369/*----------------------------------------------------------------------------
 370| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 371| and significand `zSig', and returns the proper single-precision floating-
 372| point value corresponding to the abstract input.  This routine is just like
 373| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 374| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 375| floating-point exponent.
 376*----------------------------------------------------------------------------*/
 377
 378static float32
 379 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 380{
 381    int8 shiftCount;
 382
 383    shiftCount = countLeadingZeros32( zSig ) - 1;
 384    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 385
 386}
 387
 388/*----------------------------------------------------------------------------
 389| Returns the fraction bits of the double-precision floating-point value `a'.
 390*----------------------------------------------------------------------------*/
 391
 392INLINE uint64_t extractFloat64Frac( float64 a )
 393{
 394
 395    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 396
 397}
 398
 399/*----------------------------------------------------------------------------
 400| Returns the exponent bits of the double-precision floating-point value `a'.
 401*----------------------------------------------------------------------------*/
 402
 403INLINE int_fast16_t extractFloat64Exp(float64 a)
 404{
 405
 406    return ( float64_val(a)>>52 ) & 0x7FF;
 407
 408}
 409
 410/*----------------------------------------------------------------------------
 411| Returns the sign bit of the double-precision floating-point value `a'.
 412*----------------------------------------------------------------------------*/
 413
 414INLINE flag extractFloat64Sign( float64 a )
 415{
 416
 417    return float64_val(a)>>63;
 418
 419}
 420
 421/*----------------------------------------------------------------------------
 422| If `a' is denormal and we are in flush-to-zero mode then set the
 423| input-denormal exception and return zero. Otherwise just return the value.
 424*----------------------------------------------------------------------------*/
 425static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 426{
 427    if (STATUS(flush_inputs_to_zero)) {
 428        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 429            float_raise(float_flag_input_denormal STATUS_VAR);
 430            return make_float64(float64_val(a) & (1ULL << 63));
 431        }
 432    }
 433    return a;
 434}
 435
 436/*----------------------------------------------------------------------------
 437| Normalizes the subnormal double-precision floating-point value represented
 438| by the denormalized significand `aSig'.  The normalized exponent and
 439| significand are stored at the locations pointed to by `zExpPtr' and
 440| `zSigPtr', respectively.
 441*----------------------------------------------------------------------------*/
 442
 443static void
 444 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 445{
 446    int8 shiftCount;
 447
 448    shiftCount = countLeadingZeros64( aSig ) - 11;
 449    *zSigPtr = aSig<<shiftCount;
 450    *zExpPtr = 1 - shiftCount;
 451
 452}
 453
 454/*----------------------------------------------------------------------------
 455| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 456| double-precision floating-point value, returning the result.  After being
 457| shifted into the proper positions, the three fields are simply added
 458| together to form the result.  This means that any integer portion of `zSig'
 459| will be added into the exponent.  Since a properly normalized significand
 460| will have an integer portion equal to 1, the `zExp' input should be 1 less
 461| than the desired result exponent whenever `zSig' is a complete, normalized
 462| significand.
 463*----------------------------------------------------------------------------*/
 464
 465INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 466{
 467
 468    return make_float64(
 469        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 470
 471}
 472
 473/*----------------------------------------------------------------------------
 474| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 475| and significand `zSig', and returns the proper double-precision floating-
 476| point value corresponding to the abstract input.  Ordinarily, the abstract
 477| value is simply rounded and packed into the double-precision format, with
 478| the inexact exception raised if the abstract input cannot be represented
 479| exactly.  However, if the abstract value is too large, the overflow and
 480| inexact exceptions are raised and an infinity or maximal finite value is
 481| returned.  If the abstract value is too small, the input value is rounded
 482| to a subnormal number, and the underflow and inexact exceptions are raised
 483| if the abstract input cannot be represented exactly as a subnormal double-
 484| precision floating-point number.
 485|     The input significand `zSig' has its binary point between bits 62
 486| and 61, which is 10 bits to the left of the usual location.  This shifted
 487| significand must be normalized or smaller.  If `zSig' is not normalized,
 488| `zExp' must be 0; in that case, the result returned is a subnormal number,
 489| and it must not require rounding.  In the usual case that `zSig' is
 490| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 491| The handling of underflow and overflow follows the IEC/IEEE Standard for
 492| Binary Floating-Point Arithmetic.
 493*----------------------------------------------------------------------------*/
 494
 495static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 496{
 497    int8 roundingMode;
 498    flag roundNearestEven;
 499    int_fast16_t roundIncrement, roundBits;
 500    flag isTiny;
 501
 502    roundingMode = STATUS(float_rounding_mode);
 503    roundNearestEven = ( roundingMode == float_round_nearest_even );
 504    roundIncrement = 0x200;
 505    if ( ! roundNearestEven ) {
 506        if ( roundingMode == float_round_to_zero ) {
 507            roundIncrement = 0;
 508        }
 509        else {
 510            roundIncrement = 0x3FF;
 511            if ( zSign ) {
 512                if ( roundingMode == float_round_up ) roundIncrement = 0;
 513            }
 514            else {
 515                if ( roundingMode == float_round_down ) roundIncrement = 0;
 516            }
 517        }
 518    }
 519    roundBits = zSig & 0x3FF;
 520    if ( 0x7FD <= (uint16_t) zExp ) {
 521        if (    ( 0x7FD < zExp )
 522             || (    ( zExp == 0x7FD )
 523                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 524           ) {
 525            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 526            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 527        }
 528        if ( zExp < 0 ) {
 529            if (STATUS(flush_to_zero)) {
 530                float_raise(float_flag_output_denormal STATUS_VAR);
 531                return packFloat64(zSign, 0, 0);
 532            }
 533            isTiny =
 534                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 535                || ( zExp < -1 )
 536                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 537            shift64RightJamming( zSig, - zExp, &zSig );
 538            zExp = 0;
 539            roundBits = zSig & 0x3FF;
 540            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 541        }
 542    }
 543    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 544    zSig = ( zSig + roundIncrement )>>10;
 545    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 546    if ( zSig == 0 ) zExp = 0;
 547    return packFloat64( zSign, zExp, zSig );
 548
 549}
 550
 551/*----------------------------------------------------------------------------
 552| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 553| and significand `zSig', and returns the proper double-precision floating-
 554| point value corresponding to the abstract input.  This routine is just like
 555| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 556| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 557| floating-point exponent.
 558*----------------------------------------------------------------------------*/
 559
 560static float64
 561 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 562{
 563    int8 shiftCount;
 564
 565    shiftCount = countLeadingZeros64( zSig ) - 1;
 566    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 567
 568}
 569
 570/*----------------------------------------------------------------------------
 571| Returns the fraction bits of the extended double-precision floating-point
 572| value `a'.
 573*----------------------------------------------------------------------------*/
 574
 575INLINE uint64_t extractFloatx80Frac( floatx80 a )
 576{
 577
 578    return a.low;
 579
 580}
 581
 582/*----------------------------------------------------------------------------
 583| Returns the exponent bits of the extended double-precision floating-point
 584| value `a'.
 585*----------------------------------------------------------------------------*/
 586
 587INLINE int32 extractFloatx80Exp( floatx80 a )
 588{
 589
 590    return a.high & 0x7FFF;
 591
 592}
 593
 594/*----------------------------------------------------------------------------
 595| Returns the sign bit of the extended double-precision floating-point value
 596| `a'.
 597*----------------------------------------------------------------------------*/
 598
 599INLINE flag extractFloatx80Sign( floatx80 a )
 600{
 601
 602    return a.high>>15;
 603
 604}
 605
 606/*----------------------------------------------------------------------------
 607| Normalizes the subnormal extended double-precision floating-point value
 608| represented by the denormalized significand `aSig'.  The normalized exponent
 609| and significand are stored at the locations pointed to by `zExpPtr' and
 610| `zSigPtr', respectively.
 611*----------------------------------------------------------------------------*/
 612
 613static void
 614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 615{
 616    int8 shiftCount;
 617
 618    shiftCount = countLeadingZeros64( aSig );
 619    *zSigPtr = aSig<<shiftCount;
 620    *zExpPtr = 1 - shiftCount;
 621
 622}
 623
 624/*----------------------------------------------------------------------------
 625| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 626| extended double-precision floating-point value, returning the result.
 627*----------------------------------------------------------------------------*/
 628
 629INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 630{
 631    floatx80 z;
 632
 633    z.low = zSig;
 634    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 635    return z;
 636
 637}
 638
 639/*----------------------------------------------------------------------------
 640| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 641| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 642| and returns the proper extended double-precision floating-point value
 643| corresponding to the abstract input.  Ordinarily, the abstract value is
 644| rounded and packed into the extended double-precision format, with the
 645| inexact exception raised if the abstract input cannot be represented
 646| exactly.  However, if the abstract value is too large, the overflow and
 647| inexact exceptions are raised and an infinity or maximal finite value is
 648| returned.  If the abstract value is too small, the input value is rounded to
 649| a subnormal number, and the underflow and inexact exceptions are raised if
 650| the abstract input cannot be represented exactly as a subnormal extended
 651| double-precision floating-point number.
 652|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 653| number of bits as single or double precision, respectively.  Otherwise, the
 654| result is rounded to the full precision of the extended double-precision
 655| format.
 656|     The input significand must be normalized or smaller.  If the input
 657| significand is not normalized, `zExp' must be 0; in that case, the result
 658| returned is a subnormal number, and it must not require rounding.  The
 659| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 660| Floating-Point Arithmetic.
 661*----------------------------------------------------------------------------*/
 662
 663static floatx80
 664 roundAndPackFloatx80(
 665     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 666 STATUS_PARAM)
 667{
 668    int8 roundingMode;
 669    flag roundNearestEven, increment, isTiny;
 670    int64 roundIncrement, roundMask, roundBits;
 671
 672    roundingMode = STATUS(float_rounding_mode);
 673    roundNearestEven = ( roundingMode == float_round_nearest_even );
 674    if ( roundingPrecision == 80 ) goto precision80;
 675    if ( roundingPrecision == 64 ) {
 676        roundIncrement = LIT64( 0x0000000000000400 );
 677        roundMask = LIT64( 0x00000000000007FF );
 678    }
 679    else if ( roundingPrecision == 32 ) {
 680        roundIncrement = LIT64( 0x0000008000000000 );
 681        roundMask = LIT64( 0x000000FFFFFFFFFF );
 682    }
 683    else {
 684        goto precision80;
 685    }
 686    zSig0 |= ( zSig1 != 0 );
 687    if ( ! roundNearestEven ) {
 688        if ( roundingMode == float_round_to_zero ) {
 689            roundIncrement = 0;
 690        }
 691        else {
 692            roundIncrement = roundMask;
 693            if ( zSign ) {
 694                if ( roundingMode == float_round_up ) roundIncrement = 0;
 695            }
 696            else {
 697                if ( roundingMode == float_round_down ) roundIncrement = 0;
 698            }
 699        }
 700    }
 701    roundBits = zSig0 & roundMask;
 702    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 703        if (    ( 0x7FFE < zExp )
 704             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 705           ) {
 706            goto overflow;
 707        }
 708        if ( zExp <= 0 ) {
 709            if (STATUS(flush_to_zero)) {
 710                float_raise(float_flag_output_denormal STATUS_VAR);
 711                return packFloatx80(zSign, 0, 0);
 712            }
 713            isTiny =
 714                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 715                || ( zExp < 0 )
 716                || ( zSig0 <= zSig0 + roundIncrement );
 717            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 718            zExp = 0;
 719            roundBits = zSig0 & roundMask;
 720            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 721            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 722            zSig0 += roundIncrement;
 723            if ( (int64_t) zSig0 < 0 ) zExp = 1;
 724            roundIncrement = roundMask + 1;
 725            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 726                roundMask |= roundIncrement;
 727            }
 728            zSig0 &= ~ roundMask;
 729            return packFloatx80( zSign, zExp, zSig0 );
 730        }
 731    }
 732    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 733    zSig0 += roundIncrement;
 734    if ( zSig0 < roundIncrement ) {
 735        ++zExp;
 736        zSig0 = LIT64( 0x8000000000000000 );
 737    }
 738    roundIncrement = roundMask + 1;
 739    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 740        roundMask |= roundIncrement;
 741    }
 742    zSig0 &= ~ roundMask;
 743    if ( zSig0 == 0 ) zExp = 0;
 744    return packFloatx80( zSign, zExp, zSig0 );
 745 precision80:
 746    increment = ( (int64_t) zSig1 < 0 );
 747    if ( ! roundNearestEven ) {
 748        if ( roundingMode == float_round_to_zero ) {
 749            increment = 0;
 750        }
 751        else {
 752            if ( zSign ) {
 753                increment = ( roundingMode == float_round_down ) && zSig1;
 754            }
 755            else {
 756                increment = ( roundingMode == float_round_up ) && zSig1;
 757            }
 758        }
 759    }
 760    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 761        if (    ( 0x7FFE < zExp )
 762             || (    ( zExp == 0x7FFE )
 763                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 764                  && increment
 765                )
 766           ) {
 767            roundMask = 0;
 768 overflow:
 769            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 770            if (    ( roundingMode == float_round_to_zero )
 771                 || ( zSign && ( roundingMode == float_round_up ) )
 772                 || ( ! zSign && ( roundingMode == float_round_down ) )
 773               ) {
 774                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 775            }
 776            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 777        }
 778        if ( zExp <= 0 ) {
 779            isTiny =
 780                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 781                || ( zExp < 0 )
 782                || ! increment
 783                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 784            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 785            zExp = 0;
 786            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 787            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 788            if ( roundNearestEven ) {
 789                increment = ( (int64_t) zSig1 < 0 );
 790            }
 791            else {
 792                if ( zSign ) {
 793                    increment = ( roundingMode == float_round_down ) && zSig1;
 794                }
 795                else {
 796                    increment = ( roundingMode == float_round_up ) && zSig1;
 797                }
 798            }
 799            if ( increment ) {
 800                ++zSig0;
 801                zSig0 &=
 802                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 803                if ( (int64_t) zSig0 < 0 ) zExp = 1;
 804            }
 805            return packFloatx80( zSign, zExp, zSig0 );
 806        }
 807    }
 808    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 809    if ( increment ) {
 810        ++zSig0;
 811        if ( zSig0 == 0 ) {
 812            ++zExp;
 813            zSig0 = LIT64( 0x8000000000000000 );
 814        }
 815        else {
 816            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 817        }
 818    }
 819    else {
 820        if ( zSig0 == 0 ) zExp = 0;
 821    }
 822    return packFloatx80( zSign, zExp, zSig0 );
 823
 824}
 825
 826/*----------------------------------------------------------------------------
 827| Takes an abstract floating-point value having sign `zSign', exponent
 828| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 829| and returns the proper extended double-precision floating-point value
 830| corresponding to the abstract input.  This routine is just like
 831| `roundAndPackFloatx80' except that the input significand does not have to be
 832| normalized.
 833*----------------------------------------------------------------------------*/
 834
 835static floatx80
 836 normalizeRoundAndPackFloatx80(
 837     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 838 STATUS_PARAM)
 839{
 840    int8 shiftCount;
 841
 842    if ( zSig0 == 0 ) {
 843        zSig0 = zSig1;
 844        zSig1 = 0;
 845        zExp -= 64;
 846    }
 847    shiftCount = countLeadingZeros64( zSig0 );
 848    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 849    zExp -= shiftCount;
 850    return
 851        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 852
 853}
 854
 855/*----------------------------------------------------------------------------
 856| Returns the least-significant 64 fraction bits of the quadruple-precision
 857| floating-point value `a'.
 858*----------------------------------------------------------------------------*/
 859
 860INLINE uint64_t extractFloat128Frac1( float128 a )
 861{
 862
 863    return a.low;
 864
 865}
 866
 867/*----------------------------------------------------------------------------
 868| Returns the most-significant 48 fraction bits of the quadruple-precision
 869| floating-point value `a'.
 870*----------------------------------------------------------------------------*/
 871
 872INLINE uint64_t extractFloat128Frac0( float128 a )
 873{
 874
 875    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 876
 877}
 878
 879/*----------------------------------------------------------------------------
 880| Returns the exponent bits of the quadruple-precision floating-point value
 881| `a'.
 882*----------------------------------------------------------------------------*/
 883
 884INLINE int32 extractFloat128Exp( float128 a )
 885{
 886
 887    return ( a.high>>48 ) & 0x7FFF;
 888
 889}
 890
 891/*----------------------------------------------------------------------------
 892| Returns the sign bit of the quadruple-precision floating-point value `a'.
 893*----------------------------------------------------------------------------*/
 894
 895INLINE flag extractFloat128Sign( float128 a )
 896{
 897
 898    return a.high>>63;
 899
 900}
 901
 902/*----------------------------------------------------------------------------
 903| Normalizes the subnormal quadruple-precision floating-point value
 904| represented by the denormalized significand formed by the concatenation of
 905| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 906| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 907| significand are stored at the location pointed to by `zSig0Ptr', and the
 908| least significant 64 bits of the normalized significand are stored at the
 909| location pointed to by `zSig1Ptr'.
 910*----------------------------------------------------------------------------*/
 911
 912static void
 913 normalizeFloat128Subnormal(
 914     uint64_t aSig0,
 915     uint64_t aSig1,
 916     int32 *zExpPtr,
 917     uint64_t *zSig0Ptr,
 918     uint64_t *zSig1Ptr
 919 )
 920{
 921    int8 shiftCount;
 922
 923    if ( aSig0 == 0 ) {
 924        shiftCount = countLeadingZeros64( aSig1 ) - 15;
 925        if ( shiftCount < 0 ) {
 926            *zSig0Ptr = aSig1>>( - shiftCount );
 927            *zSig1Ptr = aSig1<<( shiftCount & 63 );
 928        }
 929        else {
 930            *zSig0Ptr = aSig1<<shiftCount;
 931            *zSig1Ptr = 0;
 932        }
 933        *zExpPtr = - shiftCount - 63;
 934    }
 935    else {
 936        shiftCount = countLeadingZeros64( aSig0 ) - 15;
 937        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 938        *zExpPtr = 1 - shiftCount;
 939    }
 940
 941}
 942
 943/*----------------------------------------------------------------------------
 944| Packs the sign `zSign', the exponent `zExp', and the significand formed
 945| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 946| floating-point value, returning the result.  After being shifted into the
 947| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 948| added together to form the most significant 32 bits of the result.  This
 949| means that any integer portion of `zSig0' will be added into the exponent.
 950| Since a properly normalized significand will have an integer portion equal
 951| to 1, the `zExp' input should be 1 less than the desired result exponent
 952| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 953| significand.
 954*----------------------------------------------------------------------------*/
 955
 956INLINE float128
 957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 958{
 959    float128 z;
 960
 961    z.low = zSig1;
 962    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 963    return z;
 964
 965}
 966
 967/*----------------------------------------------------------------------------
 968| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 969| and extended significand formed by the concatenation of `zSig0', `zSig1',
 970| and `zSig2', and returns the proper quadruple-precision floating-point value
 971| corresponding to the abstract input.  Ordinarily, the abstract value is
 972| simply rounded and packed into the quadruple-precision format, with the
 973| inexact exception raised if the abstract input cannot be represented
 974| exactly.  However, if the abstract value is too large, the overflow and
 975| inexact exceptions are raised and an infinity or maximal finite value is
 976| returned.  If the abstract value is too small, the input value is rounded to
 977| a subnormal number, and the underflow and inexact exceptions are raised if
 978| the abstract input cannot be represented exactly as a subnormal quadruple-
 979| precision floating-point number.
 980|     The input significand must be normalized or smaller.  If the input
 981| significand is not normalized, `zExp' must be 0; in that case, the result
 982| returned is a subnormal number, and it must not require rounding.  In the
 983| usual case that the input significand is normalized, `zExp' must be 1 less
 984| than the ``true'' floating-point exponent.  The handling of underflow and
 985| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 986*----------------------------------------------------------------------------*/
 987
 988static float128
 989 roundAndPackFloat128(
 990     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 991{
 992    int8 roundingMode;
 993    flag roundNearestEven, increment, isTiny;
 994
 995    roundingMode = STATUS(float_rounding_mode);
 996    roundNearestEven = ( roundingMode == float_round_nearest_even );
 997    increment = ( (int64_t) zSig2 < 0 );
 998    if ( ! roundNearestEven ) {
 999        if ( roundingMode == float_round_to_zero ) {
1000            increment = 0;
1001        }
1002        else {
1003            if ( zSign ) {
1004                increment = ( roundingMode == float_round_down ) && zSig2;
1005            }
1006            else {
1007                increment = ( roundingMode == float_round_up ) && zSig2;
1008            }
1009        }
1010    }
1011    if ( 0x7FFD <= (uint32_t) zExp ) {
1012        if (    ( 0x7FFD < zExp )
1013             || (    ( zExp == 0x7FFD )
1014                  && eq128(
1015                         LIT64( 0x0001FFFFFFFFFFFF ),
1016                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                         zSig0,
1018                         zSig1
1019                     )
1020                  && increment
1021                )
1022           ) {
1023            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024            if (    ( roundingMode == float_round_to_zero )
1025                 || ( zSign && ( roundingMode == float_round_up ) )
1026                 || ( ! zSign && ( roundingMode == float_round_down ) )
1027               ) {
1028                return
1029                    packFloat128(
1030                        zSign,
1031                        0x7FFE,
1032                        LIT64( 0x0000FFFFFFFFFFFF ),
1033                        LIT64( 0xFFFFFFFFFFFFFFFF )
1034                    );
1035            }
1036            return packFloat128( zSign, 0x7FFF, 0, 0 );
1037        }
1038        if ( zExp < 0 ) {
1039            if (STATUS(flush_to_zero)) {
1040                float_raise(float_flag_output_denormal STATUS_VAR);
1041                return packFloat128(zSign, 0, 0, 0);
1042            }
1043            isTiny =
1044                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045                || ( zExp < -1 )
1046                || ! increment
1047                || lt128(
1048                       zSig0,
1049                       zSig1,
1050                       LIT64( 0x0001FFFFFFFFFFFF ),
1051                       LIT64( 0xFFFFFFFFFFFFFFFF )
1052                   );
1053            shift128ExtraRightJamming(
1054                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055            zExp = 0;
1056            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057            if ( roundNearestEven ) {
1058                increment = ( (int64_t) zSig2 < 0 );
1059            }
1060            else {
1061                if ( zSign ) {
1062                    increment = ( roundingMode == float_round_down ) && zSig2;
1063                }
1064                else {
1065                    increment = ( roundingMode == float_round_up ) && zSig2;
1066                }
1067            }
1068        }
1069    }
1070    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071    if ( increment ) {
1072        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074    }
1075    else {
1076        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077    }
1078    return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080}
1081
1082/*----------------------------------------------------------------------------
1083| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085| returns the proper quadruple-precision floating-point value corresponding
1086| to the abstract input.  This routine is just like `roundAndPackFloat128'
1087| except that the input significand has fewer bits and does not have to be
1088| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089| point exponent.
1090*----------------------------------------------------------------------------*/
1091
1092static float128
1093 normalizeRoundAndPackFloat128(
1094     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095{
1096    int8 shiftCount;
1097    uint64_t zSig2;
1098
1099    if ( zSig0 == 0 ) {
1100        zSig0 = zSig1;
1101        zSig1 = 0;
1102        zExp -= 64;
1103    }
1104    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105    if ( 0 <= shiftCount ) {
1106        zSig2 = 0;
1107        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108    }
1109    else {
1110        shift128ExtraRightJamming(
1111            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112    }
1113    zExp -= shiftCount;
1114    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116}
1117
1118/*----------------------------------------------------------------------------
1119| Returns the result of converting the 32-bit two's complement integer `a'
1120| to the single-precision floating-point format.  The conversion is performed
1121| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122*----------------------------------------------------------------------------*/
1123
1124float32 int32_to_float32( int32 a STATUS_PARAM )
1125{
1126    flag zSign;
1127
1128    if ( a == 0 ) return float32_zero;
1129    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130    zSign = ( a < 0 );
1131    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133}
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 32-bit two's complement integer `a'
1137| to the double-precision floating-point format.  The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float64 int32_to_float64( int32 a STATUS_PARAM )
1142{
1143    flag zSign;
1144    uint32 absA;
1145    int8 shiftCount;
1146    uint64_t zSig;
1147
1148    if ( a == 0 ) return float64_zero;
1149    zSign = ( a < 0 );
1150    absA = zSign ? - a : a;
1151    shiftCount = countLeadingZeros32( absA ) + 21;
1152    zSig = absA;
1153    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155}
1156
1157/*----------------------------------------------------------------------------
1158| Returns the result of converting the 32-bit two's complement integer `a'
1159| to the extended double-precision floating-point format.  The conversion
1160| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161| Arithmetic.
1162*----------------------------------------------------------------------------*/
1163
1164floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165{
1166    flag zSign;
1167    uint32 absA;
1168    int8 shiftCount;
1169    uint64_t zSig;
1170
1171    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172    zSign = ( a < 0 );
1173    absA = zSign ? - a : a;
1174    shiftCount = countLeadingZeros32( absA ) + 32;
1175    zSig = absA;
1176    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178}
1179
1180/*----------------------------------------------------------------------------
1181| Returns the result of converting the 32-bit two's complement integer `a' to
1182| the quadruple-precision floating-point format.  The conversion is performed
1183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184*----------------------------------------------------------------------------*/
1185
1186float128 int32_to_float128( int32 a STATUS_PARAM )
1187{
1188    flag zSign;
1189    uint32 absA;
1190    int8 shiftCount;
1191    uint64_t zSig0;
1192
1193    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194    zSign = ( a < 0 );
1195    absA = zSign ? - a : a;
1196    shiftCount = countLeadingZeros32( absA ) + 17;
1197    zSig0 = absA;
1198    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200}
1201
1202/*----------------------------------------------------------------------------
1203| Returns the result of converting the 64-bit two's complement integer `a'
1204| to the single-precision floating-point format.  The conversion is performed
1205| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206*----------------------------------------------------------------------------*/
1207
1208float32 int64_to_float32( int64 a STATUS_PARAM )
1209{
1210    flag zSign;
1211    uint64 absA;
1212    int8 shiftCount;
1213
1214    if ( a == 0 ) return float32_zero;
1215    zSign = ( a < 0 );
1216    absA = zSign ? - a : a;
1217    shiftCount = countLeadingZeros64( absA ) - 40;
1218    if ( 0 <= shiftCount ) {
1219        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220    }
1221    else {
1222        shiftCount += 7;
1223        if ( shiftCount < 0 ) {
1224            shift64RightJamming( absA, - shiftCount, &absA );
1225        }
1226        else {
1227            absA <<= shiftCount;
1228        }
1229        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230    }
1231
1232}
1233
1234float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235{
1236    int8 shiftCount;
1237
1238    if ( a == 0 ) return float32_zero;
1239    shiftCount = countLeadingZeros64( a ) - 40;
1240    if ( 0 <= shiftCount ) {
1241        return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1242    }
1243    else {
1244        shiftCount += 7;
1245        if ( shiftCount < 0 ) {
1246            shift64RightJamming( a, - shiftCount, &a );
1247        }
1248        else {
1249            a <<= shiftCount;
1250        }
1251        return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1252    }
1253}
1254
1255/*----------------------------------------------------------------------------
1256| Returns the result of converting the 64-bit two's complement integer `a'
1257| to the double-precision floating-point format.  The conversion is performed
1258| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259*----------------------------------------------------------------------------*/
1260
1261float64 int64_to_float64( int64 a STATUS_PARAM )
1262{
1263    flag zSign;
1264
1265    if ( a == 0 ) return float64_zero;
1266    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267        return packFloat64( 1, 0x43E, 0 );
1268    }
1269    zSign = ( a < 0 );
1270    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272}
1273
1274float64 uint64_to_float64( uint64 a STATUS_PARAM )
1275{
1276    if ( a == 0 ) return float64_zero;
1277    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1278
1279}
1280
1281/*----------------------------------------------------------------------------
1282| Returns the result of converting the 64-bit two's complement integer `a'
1283| to the extended double-precision floating-point format.  The conversion
1284| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285| Arithmetic.
1286*----------------------------------------------------------------------------*/
1287
1288floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1289{
1290    flag zSign;
1291    uint64 absA;
1292    int8 shiftCount;
1293
1294    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295    zSign = ( a < 0 );
1296    absA = zSign ? - a : a;
1297    shiftCount = countLeadingZeros64( absA );
1298    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1299
1300}
1301
1302/*----------------------------------------------------------------------------
1303| Returns the result of converting the 64-bit two's complement integer `a' to
1304| the quadruple-precision floating-point format.  The conversion is performed
1305| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306*----------------------------------------------------------------------------*/
1307
1308float128 int64_to_float128( int64 a STATUS_PARAM )
1309{
1310    flag zSign;
1311    uint64 absA;
1312    int8 shiftCount;
1313    int32 zExp;
1314    uint64_t zSig0, zSig1;
1315
1316    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317    zSign = ( a < 0 );
1318    absA = zSign ? - a : a;
1319    shiftCount = countLeadingZeros64( absA ) + 49;
1320    zExp = 0x406E - shiftCount;
1321    if ( 64 <= shiftCount ) {
1322        zSig1 = 0;
1323        zSig0 = absA;
1324        shiftCount -= 64;
1325    }
1326    else {
1327        zSig1 = absA;
1328        zSig0 = 0;
1329    }
1330    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331    return packFloat128( zSign, zExp, zSig0, zSig1 );
1332
1333}
1334
1335/*----------------------------------------------------------------------------
1336| Returns the result of converting the single-precision floating-point value
1337| `a' to the 32-bit two's complement integer format.  The conversion is
1338| performed according to the IEC/IEEE Standard for Binary Floating-Point
1339| Arithmetic---which means in particular that the conversion is rounded
1340| according to the current rounding mode.  If `a' is a NaN, the largest
1341| positive integer is returned.  Otherwise, if the conversion overflows, the
1342| largest integer with the same sign as `a' is returned.
1343*----------------------------------------------------------------------------*/
1344
1345int32 float32_to_int32( float32 a STATUS_PARAM )
1346{
1347    flag aSign;
1348    int_fast16_t aExp, shiftCount;
1349    uint32_t aSig;
1350    uint64_t aSig64;
1351
1352    a = float32_squash_input_denormal(a STATUS_VAR);
1353    aSig = extractFloat32Frac( a );
1354    aExp = extractFloat32Exp( a );
1355    aSign = extractFloat32Sign( a );
1356    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357    if ( aExp ) aSig |= 0x00800000;
1358    shiftCount = 0xAF - aExp;
1359    aSig64 = aSig;
1360    aSig64 <<= 32;
1361    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1363
1364}
1365
1366/*----------------------------------------------------------------------------
1367| Returns the result of converting the single-precision floating-point value
1368| `a' to the 32-bit two's complement integer format.  The conversion is
1369| performed according to the IEC/IEEE Standard for Binary Floating-Point
1370| Arithmetic, except that the conversion is always rounded toward zero.
1371| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1372| the conversion overflows, the largest integer with the same sign as `a' is
1373| returned.
1374*----------------------------------------------------------------------------*/
1375
1376int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1377{
1378    flag aSign;
1379    int_fast16_t aExp, shiftCount;
1380    uint32_t aSig;
1381    int32_t z;
1382    a = float32_squash_input_denormal(a STATUS_VAR);
1383
1384    aSig = extractFloat32Frac( a );
1385    aExp = extractFloat32Exp( a );
1386    aSign = extractFloat32Sign( a );
1387    shiftCount = aExp - 0x9E;
1388    if ( 0 <= shiftCount ) {
1389        if ( float32_val(a) != 0xCF000000 ) {
1390            float_raise( float_flag_invalid STATUS_VAR);
1391            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1392        }
1393        return (int32_t) 0x80000000;
1394    }
1395    else if ( aExp <= 0x7E ) {
1396        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397        return 0;
1398    }
1399    aSig = ( aSig | 0x00800000 )<<8;
1400    z = aSig>>( - shiftCount );
1401    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1402        STATUS(float_exception_flags) |= float_flag_inexact;
1403    }
1404    if ( aSign ) z = - z;
1405    return z;
1406
1407}
1408
1409/*----------------------------------------------------------------------------
1410| Returns the result of converting the single-precision floating-point value
1411| `a' to the 16-bit two's complement integer format.  The conversion is
1412| performed according to the IEC/IEEE Standard for Binary Floating-Point
1413| Arithmetic, except that the conversion is always rounded toward zero.
1414| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1415| the conversion overflows, the largest integer with the same sign as `a' is
1416| returned.
1417*----------------------------------------------------------------------------*/
1418
1419int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1420{
1421    flag aSign;
1422    int_fast16_t aExp, shiftCount;
1423    uint32_t aSig;
1424    int32 z;
1425
1426    aSig = extractFloat32Frac( a );
1427    aExp = extractFloat32Exp( a );
1428    aSign = extractFloat32Sign( a );
1429    shiftCount = aExp - 0x8E;
1430    if ( 0 <= shiftCount ) {
1431        if ( float32_val(a) != 0xC7000000 ) {
1432            float_raise( float_flag_invalid STATUS_VAR);
1433            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434                return 0x7FFF;
1435            }
1436        }
1437        return (int32_t) 0xffff8000;
1438    }
1439    else if ( aExp <= 0x7E ) {
1440        if ( aExp | aSig ) {
1441            STATUS(float_exception_flags) |= float_flag_inexact;
1442        }
1443        return 0;
1444    }
1445    shiftCount -= 0x10;
1446    aSig = ( aSig | 0x00800000 )<<8;
1447    z = aSig>>( - shiftCount );
1448    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1449        STATUS(float_exception_flags) |= float_flag_inexact;
1450    }
1451    if ( aSign ) {
1452        z = - z;
1453    }
1454    return z;
1455
1456}
1457
1458/*----------------------------------------------------------------------------
1459| Returns the result of converting the single-precision floating-point value
1460| `a' to the 64-bit two's complement integer format.  The conversion is
1461| performed according to the IEC/IEEE Standard for Binary Floating-Point
1462| Arithmetic---which means in particular that the conversion is rounded
1463| according to the current rounding mode.  If `a' is a NaN, the largest
1464| positive integer is returned.  Otherwise, if the conversion overflows, the
1465| largest integer with the same sign as `a' is returned.
1466*----------------------------------------------------------------------------*/
1467
1468int64 float32_to_int64( float32 a STATUS_PARAM )
1469{
1470    flag aSign;
1471    int_fast16_t aExp, shiftCount;
1472    uint32_t aSig;
1473    uint64_t aSig64, aSigExtra;
1474    a = float32_squash_input_denormal(a STATUS_VAR);
1475
1476    aSig = extractFloat32Frac( a );
1477    aExp = extractFloat32Exp( a );
1478    aSign = extractFloat32Sign( a );
1479    shiftCount = 0xBE - aExp;
1480    if ( shiftCount < 0 ) {
1481        float_raise( float_flag_invalid STATUS_VAR);
1482        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483            return LIT64( 0x7FFFFFFFFFFFFFFF );
1484        }
1485        return (int64_t) LIT64( 0x8000000000000000 );
1486    }
1487    if ( aExp ) aSig |= 0x00800000;
1488    aSig64 = aSig;
1489    aSig64 <<= 40;
1490    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1492
1493}
1494
1495/*----------------------------------------------------------------------------
1496| Returns the result of converting the single-precision floating-point value
1497| `a' to the 64-bit two's complement integer format.  The conversion is
1498| performed according to the IEC/IEEE Standard for Binary Floating-Point
1499| Arithmetic, except that the conversion is always rounded toward zero.  If
1500| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1501| conversion overflows, the largest integer with the same sign as `a' is
1502| returned.
1503*----------------------------------------------------------------------------*/
1504
1505int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1506{
1507    flag aSign;
1508    int_fast16_t aExp, shiftCount;
1509    uint32_t aSig;
1510    uint64_t aSig64;
1511    int64 z;
1512    a = float32_squash_input_denormal(a STATUS_VAR);
1513
1514    aSig = extractFloat32Frac( a );
1515    aExp = extractFloat32Exp( a );
1516    aSign = extractFloat32Sign( a );
1517    shiftCount = aExp - 0xBE;
1518    if ( 0 <= shiftCount ) {
1519        if ( float32_val(a) != 0xDF000000 ) {
1520            float_raise( float_flag_invalid STATUS_VAR);
1521            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                return LIT64( 0x7FFFFFFFFFFFFFFF );
1523            }
1524        }
1525        return (int64_t) LIT64( 0x8000000000000000 );
1526    }
1527    else if ( aExp <= 0x7E ) {
1528        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529        return 0;
1530    }
1531    aSig64 = aSig | 0x00800000;
1532    aSig64 <<= 40;
1533    z = aSig64>>( - shiftCount );
1534    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1535        STATUS(float_exception_flags) |= float_flag_inexact;
1536    }
1537    if ( aSign ) z = - z;
1538    return z;
1539
1540}
1541
1542/*----------------------------------------------------------------------------
1543| Returns the result of converting the single-precision floating-point value
1544| `a' to the double-precision floating-point format.  The conversion is
1545| performed according to the IEC/IEEE Standard for Binary Floating-Point
1546| Arithmetic.
1547*----------------------------------------------------------------------------*/
1548
1549float64 float32_to_float64( float32 a STATUS_PARAM )
1550{
1551    flag aSign;
1552    int_fast16_t aExp;
1553    uint32_t aSig;
1554    a = float32_squash_input_denormal(a STATUS_VAR);
1555
1556    aSig = extractFloat32Frac( a );
1557    aExp = extractFloat32Exp( a );
1558    aSign = extractFloat32Sign( a );
1559    if ( aExp == 0xFF ) {
1560        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1561        return packFloat64( aSign, 0x7FF, 0 );
1562    }
1563    if ( aExp == 0 ) {
1564        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566        --aExp;
1567    }
1568    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1569
1570}
1571
1572/*----------------------------------------------------------------------------
1573| Returns the result of converting the single-precision floating-point value
1574| `a' to the extended double-precision floating-point format.  The conversion
1575| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576| Arithmetic.
1577*----------------------------------------------------------------------------*/
1578
1579floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1580{
1581    flag aSign;
1582    int_fast16_t aExp;
1583    uint32_t aSig;
1584
1585    a = float32_squash_input_denormal(a STATUS_VAR);
1586    aSig = extractFloat32Frac( a );
1587    aExp = extractFloat32Exp( a );
1588    aSign = extractFloat32Sign( a );
1589    if ( aExp == 0xFF ) {
1590        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1591        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592    }
1593    if ( aExp == 0 ) {
1594        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596    }
1597    aSig |= 0x00800000;
1598    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1599
1600}
1601
1602/*----------------------------------------------------------------------------
1603| Returns the result of converting the single-precision floating-point value
1604| `a' to the double-precision floating-point format.  The conversion is
1605| performed according to the IEC/IEEE Standard for Binary Floating-Point
1606| Arithmetic.
1607*----------------------------------------------------------------------------*/
1608
1609float128 float32_to_float128( float32 a STATUS_PARAM )
1610{
1611    flag aSign;
1612    int_fast16_t aExp;
1613    uint32_t aSig;
1614
1615    a = float32_squash_input_denormal(a STATUS_VAR);
1616    aSig = extractFloat32Frac( a );
1617    aExp = extractFloat32Exp( a );
1618    aSign = extractFloat32Sign( a );
1619    if ( aExp == 0xFF ) {
1620        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1621        return packFloat128( aSign, 0x7FFF, 0, 0 );
1622    }
1623    if ( aExp == 0 ) {
1624        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626        --aExp;
1627    }
1628    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1629
1630}
1631
1632/*----------------------------------------------------------------------------
1633| Rounds the single-precision floating-point value `a' to an integer, and
1634| returns the result as a single-precision floating-point value.  The
1635| operation is performed according to the IEC/IEEE Standard for Binary
1636| Floating-Point Arithmetic.
1637*----------------------------------------------------------------------------*/
1638
1639float32 float32_round_to_int( float32 a STATUS_PARAM)
1640{
1641    flag aSign;
1642    int_fast16_t aExp;
1643    uint32_t lastBitMask, roundBitsMask;
1644    int8 roundingMode;
1645    uint32_t z;
1646    a = float32_squash_input_denormal(a STATUS_VAR);
1647
1648    aExp = extractFloat32Exp( a );
1649    if ( 0x96 <= aExp ) {
1650        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651            return propagateFloat32NaN( a, a STATUS_VAR );
1652        }
1653        return a;
1654    }
1655    if ( aExp <= 0x7E ) {
1656        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1657        STATUS(float_exception_flags) |= float_flag_inexact;
1658        aSign = extractFloat32Sign( a );
1659        switch ( STATUS(float_rounding_mode) ) {
1660         case float_round_nearest_even:
1661            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662                return packFloat32( aSign, 0x7F, 0 );
1663            }
1664            break;
1665         case float_round_down:
1666            return make_float32(aSign ? 0xBF800000 : 0);
1667         case float_round_up:
1668            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1669        }
1670        return packFloat32( aSign, 0, 0 );
1671    }
1672    lastBitMask = 1;
1673    lastBitMask <<= 0x96 - aExp;
1674    roundBitsMask = lastBitMask - 1;
1675    z = float32_val(a);
1676    roundingMode = STATUS(float_rounding_mode);
1677    if ( roundingMode == float_round_nearest_even ) {
1678        z += lastBitMask>>1;
1679        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680    }
1681    else if ( roundingMode != float_round_to_zero ) {
1682        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1683            z += roundBitsMask;
1684        }
1685    }
1686    z &= ~ roundBitsMask;
1687    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688    return make_float32(z);
1689
1690}
1691
1692/*----------------------------------------------------------------------------
1693| Returns the result of adding the absolute values of the single-precision
1694| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1695| before being returned.  `zSign' is ignored if the result is a NaN.
1696| The addition is performed according to the IEC/IEEE Standard for Binary
1697| Floating-Point Arithmetic.
1698*----------------------------------------------------------------------------*/
1699
1700static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1701{
1702    int_fast16_t aExp, bExp, zExp;
1703    uint32_t aSig, bSig, zSig;
1704    int_fast16_t expDiff;
1705
1706    aSig = extractFloat32Frac( a );
1707    aExp = extractFloat32Exp( a );
1708    bSig = extractFloat32Frac( b );
1709    bExp = extractFloat32Exp( b );
1710    expDiff = aExp - bExp;
1711    aSig <<= 6;
1712    bSig <<= 6;
1713    if ( 0 < expDiff ) {
1714        if ( aExp == 0xFF ) {
1715            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716            return a;
1717        }
1718        if ( bExp == 0 ) {
1719            --expDiff;
1720        }
1721        else {
1722            bSig |= 0x20000000;
1723        }
1724        shift32RightJamming( bSig, expDiff, &bSig );
1725        zExp = aExp;
1726    }
1727    else if ( expDiff < 0 ) {
1728        if ( bExp == 0xFF ) {
1729            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730            return packFloat32( zSign, 0xFF, 0 );
1731        }
1732        if ( aExp == 0 ) {
1733            ++expDiff;
1734        }
1735        else {
1736            aSig |= 0x20000000;
1737        }
1738        shift32RightJamming( aSig, - expDiff, &aSig );
1739        zExp = bExp;
1740    }
1741    else {
1742        if ( aExp == 0xFF ) {
1743            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744            return a;
1745        }
1746        if ( aExp == 0 ) {
1747            if (STATUS(flush_to_zero)) {
1748                if (aSig | bSig) {
1749                    float_raise(float_flag_output_denormal STATUS_VAR);
1750                }
1751                return packFloat32(zSign, 0, 0);
1752            }
1753            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1754        }
1755        zSig = 0x40000000 + aSig + bSig;
1756        zExp = aExp;
1757        goto roundAndPack;
1758    }
1759    aSig |= 0x20000000;
1760    zSig = ( aSig + bSig )<<1;
1761    --zExp;
1762    if ( (int32_t) zSig < 0 ) {
1763        zSig = aSig + bSig;
1764        ++zExp;
1765    }
1766 roundAndPack:
1767    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1768
1769}
1770
1771/*----------------------------------------------------------------------------
1772| Returns the result of subtracting the absolute values of the single-
1773| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1774| difference is negated before being returned.  `zSign' is ignored if the
1775| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1776| Standard for Binary Floating-Point Arithmetic.
1777*----------------------------------------------------------------------------*/
1778
1779static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1780{
1781    int_fast16_t aExp, bExp, zExp;
1782    uint32_t aSig, bSig, zSig;
1783    int_fast16_t expDiff;
1784
1785    aSig = extractFloat32Frac( a );
1786    aExp = extractFloat32Exp( a );
1787    bSig = extractFloat32Frac( b );
1788    bExp = extractFloat32Exp( b );
1789    expDiff = aExp - bExp;
1790    aSig <<= 7;
1791    bSig <<= 7;
1792    if ( 0 < expDiff ) goto aExpBigger;
1793    if ( expDiff < 0 ) goto bExpBigger;
1794    if ( aExp == 0xFF ) {
1795        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796        float_raise( float_flag_invalid STATUS_VAR);
1797        return float32_default_nan;
1798    }
1799    if ( aExp == 0 ) {
1800        aExp = 1;
1801        bExp = 1;
1802    }
1803    if ( bSig < aSig ) goto aBigger;
1804    if ( aSig < bSig ) goto bBigger;
1805    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806 bExpBigger:
1807    if ( bExp == 0xFF ) {
1808        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809        return packFloat32( zSign ^ 1, 0xFF, 0 );
1810    }
1811    if ( aExp == 0 ) {
1812        ++expDiff;
1813    }
1814    else {
1815        aSig |= 0x40000000;
1816    }
1817    shift32RightJamming( aSig, - expDiff, &aSig );
1818    bSig |= 0x40000000;
1819 bBigger:
1820    zSig = bSig - aSig;
1821    zExp = bExp;
1822    zSign ^= 1;
1823    goto normalizeRoundAndPack;
1824 aExpBigger:
1825    if ( aExp == 0xFF ) {
1826        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827        return a;
1828    }
1829    if ( bExp == 0 ) {
1830        --expDiff;
1831    }
1832    else {
1833        bSig |= 0x40000000;
1834    }
1835    shift32RightJamming( bSig, expDiff, &bSig );
1836    aSig |= 0x40000000;
1837 aBigger:
1838    zSig = aSig - bSig;
1839    zExp = aExp;
1840 normalizeRoundAndPack:
1841    --zExp;
1842    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843
1844}
1845
1846/*----------------------------------------------------------------------------
1847| Returns the result of adding the single-precision floating-point values `a'
1848| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1849| Binary Floating-Point Arithmetic.
1850*----------------------------------------------------------------------------*/
1851
1852float32 float32_add( float32 a, float32 b STATUS_PARAM )
1853{
1854    flag aSign, bSign;
1855    a = float32_squash_input_denormal(a STATUS_VAR);
1856    b = float32_squash_input_denormal(b STATUS_VAR);
1857
1858    aSign = extractFloat32Sign( a );
1859    bSign = extractFloat32Sign( b );
1860    if ( aSign == bSign ) {
1861        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1862    }
1863    else {
1864        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865    }
1866
1867}
1868
1869/*----------------------------------------------------------------------------
1870| Returns the result of subtracting the single-precision floating-point values
1871| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1872| for Binary Floating-Point Arithmetic.
1873*----------------------------------------------------------------------------*/
1874
1875float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1876{
1877    flag aSign, bSign;
1878    a = float32_squash_input_denormal(a STATUS_VAR);
1879    b = float32_squash_input_denormal(b STATUS_VAR);
1880
1881    aSign = extractFloat32Sign( a );
1882    bSign = extractFloat32Sign( b );
1883    if ( aSign == bSign ) {
1884        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1885    }
1886    else {
1887        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1888    }
1889
1890}
1891
1892/*----------------------------------------------------------------------------
1893| Returns the result of multiplying the single-precision floating-point values
1894| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1895| for Binary Floating-Point Arithmetic.
1896*----------------------------------------------------------------------------*/
1897
1898float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1899{
1900    flag aSign, bSign, zSign;
1901    int_fast16_t aExp, bExp, zExp;
1902    uint32_t aSig, bSig;
1903    uint64_t zSig64;
1904    uint32_t zSig;
1905
1906    a = float32_squash_input_denormal(a STATUS_VAR);
1907    b = float32_squash_input_denormal(b STATUS_VAR);
1908
1909    aSig = extractFloat32Frac( a );
1910    aExp = extractFloat32Exp( a );
1911    aSign = extractFloat32Sign( a );
1912    bSig = extractFloat32Frac( b );
1913    bExp = extractFloat32Exp( b );
1914    bSign = extractFloat32Sign( b );
1915    zSign = aSign ^ bSign;
1916    if ( aExp == 0xFF ) {
1917        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918            return propagateFloat32NaN( a, b STATUS_VAR );
1919        }
1920        if ( ( bExp | bSig ) == 0 ) {
1921            float_raise( float_flag_invalid STATUS_VAR);
1922            return float32_default_nan;
1923        }
1924        return packFloat32( zSign, 0xFF, 0 );
1925    }
1926    if ( bExp == 0xFF ) {
1927        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928        if ( ( aExp | aSig ) == 0 ) {
1929            float_raise( float_flag_invalid STATUS_VAR);
1930            return float32_default_nan;
1931        }
1932        return packFloat32( zSign, 0xFF, 0 );
1933    }
1934    if ( aExp == 0 ) {
1935        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937    }
1938    if ( bExp == 0 ) {
1939        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941    }
1942    zExp = aExp + bExp - 0x7F;
1943    aSig = ( aSig | 0x00800000 )<<7;
1944    bSig = ( bSig | 0x00800000 )<<8;
1945    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1946    zSig = zSig64;
1947    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1948        zSig <<= 1;
1949        --zExp;
1950    }
1951    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1952
1953}
1954
1955/*----------------------------------------------------------------------------
1956| Returns the result of dividing the single-precision floating-point value `a'
1957| by the corresponding value `b'.  The operation is performed according to the
1958| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959*----------------------------------------------------------------------------*/
1960
1961float32 float32_div( float32 a, float32 b STATUS_PARAM )
1962{
1963    flag aSign, bSign, zSign;
1964    int_fast16_t aExp, bExp, zExp;
1965    uint32_t aSig, bSig, zSig;
1966    a = float32_squash_input_denormal(a STATUS_VAR);
1967    b = float32_squash_input_denormal(b STATUS_VAR);
1968
1969    aSig = extractFloat32Frac( a );
1970    aExp = extractFloat32Exp( a );
1971    aSign = extractFloat32Sign( a );
1972    bSig = extractFloat32Frac( b );
1973    bExp = extractFloat32Exp( b );
1974    bSign = extractFloat32Sign( b );
1975    zSign = aSign ^ bSign;
1976    if ( aExp == 0xFF ) {
1977        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978        if ( bExp == 0xFF ) {
1979            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980            float_raise( float_flag_invalid STATUS_VAR);
1981            return float32_default_nan;
1982        }
1983        return packFloat32( zSign, 0xFF, 0 );
1984    }
1985    if ( bExp == 0xFF ) {
1986        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987        return packFloat32( zSign, 0, 0 );
1988    }
1989    if ( bExp == 0 ) {
1990        if ( bSig == 0 ) {
1991            if ( ( aExp | aSig ) == 0 ) {
1992                float_raise( float_flag_invalid STATUS_VAR);
1993                return float32_default_nan;
1994            }
1995            float_raise( float_flag_divbyzero STATUS_VAR);
1996            return packFloat32( zSign, 0xFF, 0 );
1997        }
1998        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999    }
2000    if ( aExp == 0 ) {
2001        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003    }
2004    zExp = aExp - bExp + 0x7D;
2005    aSig = ( aSig | 0x00800000 )<<7;
2006    bSig = ( bSig | 0x00800000 )<<8;
2007    if ( bSig <= ( aSig + aSig ) ) {
2008        aSig >>= 1;
2009        ++zExp;
2010    }
2011    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2012    if ( ( zSig & 0x3F ) == 0 ) {
2013        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2014    }
2015    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016
2017}
2018
2019/*----------------------------------------------------------------------------
2020| Returns the remainder of the single-precision floating-point value `a'
2021| with respect to the corresponding value `b'.  The operation is performed
2022| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023*----------------------------------------------------------------------------*/
2024
2025float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2026{
2027    flag aSign, zSign;
2028    int_fast16_t aExp, bExp, expDiff;
2029    uint32_t aSig, bSig;
2030    uint32_t q;
2031    uint64_t aSig64, bSig64, q64;
2032    uint32_t alternateASig;
2033    int32_t sigMean;
2034    a = float32_squash_input_denormal(a STATUS_VAR);
2035    b = float32_squash_input_denormal(b STATUS_VAR);
2036
2037    aSig = extractFloat32Frac( a );
2038    aExp = extractFloat32Exp( a );
2039    aSign = extractFloat32Sign( a );
2040    bSig = extractFloat32Frac( b );
2041    bExp = extractFloat32Exp( b );
2042    if ( aExp == 0xFF ) {
2043        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044            return propagateFloat32NaN( a, b STATUS_VAR );
2045        }
2046        float_raise( float_flag_invalid STATUS_VAR);
2047        return float32_default_nan;
2048    }
2049    if ( bExp == 0xFF ) {
2050        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051        return a;
2052    }
2053    if ( bExp == 0 ) {
2054        if ( bSig == 0 ) {
2055            float_raise( float_flag_invalid STATUS_VAR);
2056            return float32_default_nan;
2057        }
2058        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059    }
2060    if ( aExp == 0 ) {
2061        if ( aSig == 0 ) return a;
2062        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063    }
2064    expDiff = aExp - bExp;
2065    aSig |= 0x00800000;
2066    bSig |= 0x00800000;
2067    if ( expDiff < 32 ) {
2068        aSig <<= 8;
2069        bSig <<= 8;
2070        if ( expDiff < 0 ) {
2071            if ( expDiff < -1 ) return a;
2072            aSig >>= 1;
2073        }
2074        q = ( bSig <= aSig );
2075        if ( q ) aSig -= bSig;
2076        if ( 0 < expDiff ) {
2077            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2078            q >>= 32 - expDiff;
2079            bSig >>= 2;
2080            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081        }
2082        else {
2083            aSig >>= 2;
2084            bSig >>= 2;
2085        }
2086    }
2087    else {
2088        if ( bSig <= aSig ) aSig -= bSig;
2089        aSig64 = ( (uint64_t) aSig )<<40;
2090        bSig64 = ( (uint64_t) bSig )<<40;
2091        expDiff -= 64;
2092        while ( 0 < expDiff ) {
2093            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095            aSig64 = - ( ( bSig * q64 )<<38 );
2096            expDiff -= 62;
2097        }
2098        expDiff += 64;
2099        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101        q = q64>>( 64 - expDiff );
2102        bSig <<= 6;
2103        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104    }
2105    do {
2106        alternateASig = aSig;
2107        ++q;
2108        aSig -= bSig;
2109    } while ( 0 <= (int32_t) aSig );
2110    sigMean = aSig + alternateASig;
2111    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112        aSig = alternateASig;
2113    }
2114    zSign = ( (int32_t) aSig < 0 );
2115    if ( zSign ) aSig = - aSig;
2116    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2117
2118}
2119
2120/*----------------------------------------------------------------------------
2121| Returns the result of multiplying the single-precision floating-point values
2122| `a' and `b' then adding 'c', with no intermediate rounding step after the
2123| multiplication.  The operation is performed according to the IEC/IEEE
2124| Standard for Binary Floating-Point Arithmetic 754-2008.
2125| The flags argument allows the caller to select negation of the
2126| addend, the intermediate product, or the final result. (The difference
2127| between this and having the caller do a separate negation is that negating
2128| externally will flip the sign bit on NaNs.)
2129*----------------------------------------------------------------------------*/
2130
2131float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2132{
2133    flag aSign, bSign, cSign, zSign;
2134    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2135    uint32_t aSig, bSig, cSig;
2136    flag pInf, pZero, pSign;
2137    uint64_t pSig64, cSig64, zSig64;
2138    uint32_t pSig;
2139    int shiftcount;
2140    flag signflip, infzero;
2141
2142    a = float32_squash_input_denormal(a STATUS_VAR);
2143    b = float32_squash_input_denormal(b STATUS_VAR);
2144    c = float32_squash_input_denormal(c STATUS_VAR);
2145    aSig = extractFloat32Frac(a);
2146    aExp = extractFloat32Exp(a);
2147    aSign = extractFloat32Sign(a);
2148    bSig = extractFloat32Frac(b);
2149    bExp = extractFloat32Exp(b);
2150    bSign = extractFloat32Sign(b);
2151    cSig = extractFloat32Frac(c);
2152    cExp = extractFloat32Exp(c);
2153    cSign = extractFloat32Sign(c);
2154
2155    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2156               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2157
2158    /* It is implementation-defined whether the cases of (0,inf,qnan)
2159     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2160     * they return if they do), so we have to hand this information
2161     * off to the target-specific pick-a-NaN routine.
2162     */
2163    if (((aExp == 0xff) && aSig) ||
2164        ((bExp == 0xff) && bSig) ||
2165        ((cExp == 0xff) && cSig)) {
2166        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2167    }
2168
2169    if (infzero) {
2170        float_raise(float_flag_invalid STATUS_VAR);
2171        return float32_default_nan;
2172    }
2173
2174    if (flags & float_muladd_negate_c) {
2175        cSign ^= 1;
2176    }
2177
2178    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2179
2180    /* Work out the sign and type of the product */
2181    pSign = aSign ^ bSign;
2182    if (flags & float_muladd_negate_product) {
2183        pSign ^= 1;
2184    }
2185    pInf = (aExp == 0xff) || (bExp == 0xff);
2186    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2187
2188    if (cExp == 0xff) {
2189        if (pInf && (pSign ^ cSign)) {
2190            /* addition of opposite-signed infinities => InvalidOperation */
2191            float_raise(float_flag_invalid STATUS_VAR);
2192            return float32_default_nan;
2193        }
2194        /* Otherwise generate an infinity of the same sign */
2195        return packFloat32(cSign ^ signflip, 0xff, 0);
2196    }
2197
2198    if (pInf) {
2199        return packFloat32(pSign ^ signflip, 0xff, 0);
2200    }
2201
2202    if (pZero) {
2203        if (cExp == 0) {
2204            if (cSig == 0) {
2205                /* Adding two exact zeroes */
2206                if (pSign == cSign) {
2207                    zSign = pSign;
2208                } else if (STATUS(float_rounding_mode) == float_round_down) {
2209                    zSign = 1;
2210                } else {
2211                    zSign = 0;
2212                }
2213                return packFloat32(zSign ^ signflip, 0, 0);
2214            }
2215            /* Exact zero plus a denorm */
2216            if (STATUS(flush_to_zero)) {
2217                float_raise(float_flag_output_denormal STATUS_VAR);
2218                return packFloat32(cSign ^ signflip, 0, 0);
2219            }
2220        }
2221        /* Zero plus something non-zero : just return the something */
2222        return make_float32(float32_val(c) ^ (signflip << 31));
2223    }
2224
2225    if (aExp == 0) {
2226        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2227    }
2228    if (bExp == 0) {
2229        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2230    }
2231
2232    /* Calculate the actual result a * b + c */
2233
2234    /* Multiply first; this is easy. */
2235    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2236     * because we want the true exponent, not the "one-less-than"
2237     * flavour that roundAndPackFloat32() takes.
2238     */
2239    pExp = aExp + bExp - 0x7e;
2240    aSig = (aSig | 0x00800000) << 7;
2241    bSig = (bSig | 0x00800000) << 8;
2242    pSig64 = (uint64_t)aSig * bSig;
2243    if ((int64_t)(pSig64 << 1) >= 0) {
2244        pSig64 <<= 1;
2245        pExp--;
2246    }
2247
2248    zSign = pSign ^ signflip;
2249
2250    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2251     * position 62.
2252     */
2253    if (cExp == 0) {
2254        if (!cSig) {
2255            /* Throw out the special case of c being an exact zero now */
2256            shift64RightJamming(pSig64, 32, &pSig64);
2257            pSig = pSig64;
2258            return roundAndPackFloat32(zSign, pExp - 1,
2259                                       pSig STATUS_VAR);
2260        }
2261        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2262    }
2263
2264    cSig64 = (uint64_t)cSig << (62 - 23);
2265    cSig64 |= LIT64(0x4000000000000000);
2266    expDiff = pExp - cExp;
2267
2268    if (pSign == cSign) {
2269        /* Addition */
2270        if (expDiff > 0) {
2271            /* scale c to match p */
2272            shift64RightJamming(cSig64, expDiff, &cSig64);
2273            zExp = pExp;
2274        } else if (expDiff < 0) {
2275            /* scale p to match c */
2276            shift64RightJamming(pSig64, -expDiff, &pSig64);
2277            zExp = cExp;
2278        } else {
2279            /* no scaling needed */
2280            zExp = cExp;
2281        }
2282        /* Add significands and make sure explicit bit ends up in posn 62 */
2283        zSig64 = pSig64 + cSig64;
2284        if ((int64_t)zSig64 < 0) {
2285            shift64RightJamming(zSig64, 1, &zSig64);
2286        } else {
2287            zExp--;
2288        }
2289    } else {
2290        /* Subtraction */
2291        if (expDiff > 0) {
2292            shift64RightJamming(cSig64, expDiff, &cSig64);
2293            zSig64 = pSig64 - cSig64;
2294            zExp = pExp;
2295        } else if (expDiff < 0) {
2296            shift64RightJamming(pSig64, -expDiff, &pSig64);
2297            zSig64 = cSig64 - pSig64;
2298            zExp = cExp;
2299            zSign ^= 1;
2300        } else {
2301            zExp = pExp;
2302            if (cSig64 < pSig64) {
2303                zSig64 = pSig64 - cSig64;
2304            } else if (pSig64 < cSig64) {
2305                zSig64 = cSig64 - pSig64;
2306                zSign ^= 1;
2307            } else {
2308                /* Exact zero */
2309                zSign = signflip;
2310                if (STATUS(float_rounding_mode) == float_round_down) {
2311                    zSign ^= 1;
2312                }
2313                return packFloat32(zSign, 0, 0);
2314            }
2315        }
2316        --zExp;
2317        /* Normalize to put the explicit bit back into bit 62. */
2318        shiftcount = countLeadingZeros64(zSig64) - 1;
2319        zSig64 <<= shiftcount;
2320        zExp -= shiftcount;
2321    }
2322    shift64RightJamming(zSig64, 32, &zSig64);
2323    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2324}
2325
2326
2327/*----------------------------------------------------------------------------
2328| Returns the square root of the single-precision floating-point value `a'.
2329| The operation is performed according to the IEC/IEEE Standard for Binary
2330| Floating-Point Arithmetic.
2331*----------------------------------------------------------------------------*/
2332
2333float32 float32_sqrt( float32 a STATUS_PARAM )
2334{
2335    flag aSign;
2336    int_fast16_t aExp, zExp;
2337    uint32_t aSig, zSig;
2338    uint64_t rem, term;
2339    a = float32_squash_input_denormal(a STATUS_VAR);
2340
2341    aSig = extractFloat32Frac( a );
2342    aExp = extractFloat32Exp( a );
2343    aSign = extractFloat32Sign( a );
2344    if ( aExp == 0xFF ) {
2345        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2346        if ( ! aSign ) return a;
2347        float_raise( float_flag_invalid STATUS_VAR);
2348        return float32_default_nan;
2349    }
2350    if ( aSign ) {
2351        if ( ( aExp | aSig ) == 0 ) return a;
2352        float_raise( float_flag_invalid STATUS_VAR);
2353        return float32_default_nan;
2354    }
2355    if ( aExp == 0 ) {
2356        if ( aSig == 0 ) return float32_zero;
2357        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2358    }
2359    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2360    aSig = ( aSig | 0x00800000 )<<8;
2361    zSig = estimateSqrt32( aExp, aSig ) + 2;
2362    if ( ( zSig & 0x7F ) <= 5 ) {
2363        if ( zSig < 2 ) {
2364            zSig = 0x7FFFFFFF;
2365            goto roundAndPack;
2366        }
2367        aSig >>= aExp & 1;
2368        term = ( (uint64_t) zSig ) * zSig;
2369        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2370        while ( (int64_t) rem < 0 ) {
2371            --zSig;
2372            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2373        }
2374        zSig |= ( rem != 0 );
2375    }
2376    shift32RightJamming( zSig, 1, &zSig );
2377 roundAndPack:
2378    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2379
2380}
2381
2382/*----------------------------------------------------------------------------
2383| Returns the binary exponential of the single-precision floating-point value
2384| `a'. The operation is performed according to the IEC/IEEE Standard for
2385| Binary Floating-Point Arithmetic.
2386|
2387| Uses the following identities:
2388|
2389| 1. -------------------------------------------------------------------------
2390|      x    x*ln(2)
2391|     2  = e
2392|
2393| 2. -------------------------------------------------------------------------
2394|                      2     3     4     5           n
2395|      x        x     x     x     x     x           x
2396|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2397|               1!    2!    3!    4!    5!          n!
2398*----------------------------------------------------------------------------*/
2399
2400static const float64 float32_exp2_coefficients[15] =
2401{
2402    const_float64( 0x3ff0000000000000ll ), /*  1 */
2403    const_float64( 0x3fe0000000000000ll ), /*  2 */
2404    const_float64( 0x3fc5555555555555ll ), /*  3 */
2405    const_float64( 0x3fa5555555555555ll ), /*  4 */
2406    const_float64( 0x3f81111111111111ll ), /*  5 */
2407    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2408    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2409    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2410    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2411    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2412    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2413    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2414    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2415    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2416    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2417};
2418
2419float32 float32_exp2( float32 a STATUS_PARAM )
2420{
2421    flag aSign;
2422    int_fast16_t aExp;
2423    uint32_t aSig;
2424    float64 r, x, xn;
2425    int i;
2426    a = float32_squash_input_denormal(a STATUS_VAR);
2427
2428    aSig = extractFloat32Frac( a );
2429    aExp = extractFloat32Exp( a );
2430    aSign = extractFloat32Sign( a );
2431
2432    if ( aExp == 0xFF) {
2433        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2434        return (aSign) ? float32_zero : a;
2435    }
2436    if (aExp == 0) {
2437        if (aSig == 0) return float32_one;
2438    }
2439
2440    float_raise( float_flag_inexact STATUS_VAR);
2441
2442    /* ******************************* */
2443    /* using float64 for approximation */
2444    /* ******************************* */
2445    x = float32_to_float64(a STATUS_VAR);
2446    x = float64_mul(x, float64_ln2 STATUS_VAR);
2447
2448    xn = x;
2449    r = float64_one;
2450    for (i = 0 ; i < 15 ; i++) {
2451        float64 f;
2452
2453        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2454        r = float64_add(r, f STATUS_VAR);
2455
2456        xn = float64_mul(xn, x STATUS_VAR);
2457    }
2458
2459    return float64_to_float32(r, status);
2460}
2461
2462/*----------------------------------------------------------------------------
2463| Returns the binary log of the single-precision floating-point value `a'.
2464| The operation is performed according to the IEC/IEEE Standard for Binary
2465| Floating-Point Arithmetic.
2466*----------------------------------------------------------------------------*/
2467float32 float32_log2( float32 a STATUS_PARAM )
2468{
2469    flag aSign, zSign;
2470    int_fast16_t aExp;
2471    uint32_t aSig, zSig, i;
2472
2473    a = float32_squash_input_denormal(a STATUS_VAR);
2474    aSig = extractFloat32Frac( a );
2475    aExp = extractFloat32Exp( a );
2476    aSign = extractFloat32Sign( a );
2477
2478    if ( aExp == 0 ) {
2479        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2480        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2481    }
2482    if ( aSign ) {
2483        float_raise( float_flag_invalid STATUS_VAR);
2484        return float32_default_nan;
2485    }
2486    if ( aExp == 0xFF ) {
2487        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2488        return a;
2489    }
2490
2491    aExp -= 0x7F;
2492    aSig |= 0x00800000;
2493    zSign = aExp < 0;
2494    zSig = aExp << 23;
2495
2496    for (i = 1 << 22; i > 0; i >>= 1) {
2497        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2498        if ( aSig & 0x01000000 ) {
2499            aSig >>= 1;
2500            zSig |= i;
2501        }
2502    }
2503
2504    if ( zSign )
2505        zSig = -zSig;
2506
2507    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2508}
2509
2510/*----------------------------------------------------------------------------
2511| Returns 1 if the single-precision floating-point value `a' is equal to
2512| the corresponding value `b', and 0 otherwise.  The invalid exception is
2513| raised if either operand is a NaN.  Otherwise, the comparison is performed
2514| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2515*----------------------------------------------------------------------------*/
2516
2517int float32_eq( float32 a, float32 b STATUS_PARAM )
2518{
2519    uint32_t av, bv;
2520    a = float32_squash_input_denormal(a STATUS_VAR);
2521    b = float32_squash_input_denormal(b STATUS_VAR);
2522
2523    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2524         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2525       ) {
2526        float_raise( float_flag_invalid STATUS_VAR);
2527        return 0;
2528    }
2529    av = float32_val(a);
2530    bv = float32_val(b);
2531    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2532}
2533
2534/*----------------------------------------------------------------------------
2535| Returns 1 if the single-precision floating-point value `a' is less than
2536| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2537| exception is raised if either operand is a NaN.  The comparison is performed
2538| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2539*----------------------------------------------------------------------------*/
2540
2541int float32_le( float32 a, float32 b STATUS_PARAM )
2542{
2543    flag aSign, bSign;
2544    uint32_t av, bv;
2545    a = float32_squash_input_denormal(a STATUS_VAR);
2546    b = float32_squash_input_denormal(b STATUS_VAR);
2547
2548    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2549         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2550       ) {
2551        float_raise( float_flag_invalid STATUS_VAR);
2552        return 0;
2553    }
2554    aSign = extractFloat32Sign( a );
2555    bSign = extractFloat32Sign( b );
2556    av = float32_val(a);
2557    bv = float32_val(b);
2558    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2559    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2560
2561}
2562
2563/*----------------------------------------------------------------------------
2564| Returns 1 if the single-precision floating-point value `a' is less than
2565| the corresponding value `b', and 0 otherwise.  The invalid exception is
2566| raised if either operand is a NaN.  The comparison is performed according
2567| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2568*----------------------------------------------------------------------------*/
2569
2570int float32_lt( float32 a, float32 b STATUS_PARAM )
2571{
2572    flag aSign, bSign;
2573    uint32_t av, bv;
2574    a = float32_squash_input_denormal(a STATUS_VAR);
2575    b = float32_squash_input_denormal(b STATUS_VAR);
2576
2577    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2578         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2579       ) {
2580        float_raise( float_flag_invalid STATUS_VAR);
2581        return 0;
2582    }
2583    aSign = extractFloat32Sign( a );
2584    bSign = extractFloat32Sign( b );
2585    av = float32_val(a);
2586    bv = float32_val(b);
2587    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2588    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2589
2590}
2591
2592/*----------------------------------------------------------------------------
2593| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2594| be compared, and 0 otherwise.  The invalid exception is raised if either
2595| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2596| Standard for Binary Floating-Point Arithmetic.
2597*----------------------------------------------------------------------------*/
2598
2599int float32_unordered( float32 a, float32 b STATUS_PARAM )
2600{
2601    a = float32_squash_input_denormal(a STATUS_VAR);
2602    b = float32_squash_input_denormal(b STATUS_VAR);
2603
2604    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2605         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2606       ) {
2607        float_raise( float_flag_invalid STATUS_VAR);
2608        return 1;
2609    }
2610    return 0;
2611}
2612
2613/*----------------------------------------------------------------------------
2614| Returns 1 if the single-precision floating-point value `a' is equal to
2615| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2616| exception.  The comparison is performed according to the IEC/IEEE Standard
2617| for Binary Floating-Point Arithmetic.
2618*----------------------------------------------------------------------------*/
2619
2620int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2621{
2622    a = float32_squash_input_denormal(a STATUS_VAR);
2623    b = float32_squash_input_denormal(b STATUS_VAR);
2624
2625    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2626         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2627       ) {
2628        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2629            float_raise( float_flag_invalid STATUS_VAR);
2630        }
2631        return 0;
2632    }
2633    return ( float32_val(a) == float32_val(b) ) ||
2634            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2635}
2636
2637/*----------------------------------------------------------------------------
2638| Returns 1 if the single-precision floating-point value `a' is less than or
2639| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2640| cause an exception.  Otherwise, the comparison is performed according to the
2641| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2642*----------------------------------------------------------------------------*/
2643
2644int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2645{
2646    flag aSign, bSign;
2647    uint32_t av, bv;
2648    a = float32_squash_input_denormal(a STATUS_VAR);
2649    b = float32_squash_input_denormal(b STATUS_VAR);
2650
2651    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2652         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2653       ) {
2654        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2655            float_raise( float_flag_invalid STATUS_VAR);
2656        }
2657        return 0;
2658    }
2659    aSign = extractFloat32Sign( a );
2660    bSign = extractFloat32Sign( b );
2661    av = float32_val(a);
2662    bv = float32_val(b);
2663    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2664    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2665
2666}
2667
2668/*----------------------------------------------------------------------------
2669| Returns 1 if the single-precision floating-point value `a' is less than
2670| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2671| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2672| Standard for Binary Floating-Point Arithmetic.
2673*----------------------------------------------------------------------------*/
2674
2675int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2676{
2677    flag aSign, bSign;
2678    uint32_t av, bv;
2679    a = float32_squash_input_denormal(a STATUS_VAR);
2680    b = float32_squash_input_denormal(b STATUS_VAR);
2681
2682    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2683         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2684       ) {
2685        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2686            float_raise( float_flag_invalid STATUS_VAR);
2687        }
2688        return 0;
2689    }
2690    aSign = extractFloat32Sign( a );
2691    bSign = extractFloat32Sign( b );
2692    av = float32_val(a);
2693    bv = float32_val(b);
2694    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2695    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2696
2697}
2698
2699/*----------------------------------------------------------------------------
2700| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2701| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2702| comparison is performed according to the IEC/IEEE Standard for Binary
2703| Floating-Point Arithmetic.
2704*----------------------------------------------------------------------------*/
2705
2706int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2707{
2708    a = float32_squash_input_denormal(a STATUS_VAR);
2709    b = float32_squash_input_denormal(b STATUS_VAR);
2710
2711    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2712         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2713       ) {
2714        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2715            float_raise( float_flag_invalid STATUS_VAR);
2716        }
2717        return 1;
2718    }
2719    return 0;
2720}
2721
2722/*----------------------------------------------------------------------------
2723| Returns the result of converting the double-precision floating-point value
2724| `a' to the 32-bit two's complement integer format.  The conversion is
2725| performed according to the IEC/IEEE Standard for Binary Floating-Point
2726| Arithmetic---which means in particular that the conversion is rounded
2727| according to the current rounding mode.  If `a' is a NaN, the largest
2728| positive integer is returned.  Otherwise, if the conversion overflows, the
2729| largest integer with the same sign as `a' is returned.
2730*----------------------------------------------------------------------------*/
2731
2732int32 float64_to_int32( float64 a STATUS_PARAM )
2733{
2734    flag aSign;
2735    int_fast16_t aExp, shiftCount;
2736    uint64_t aSig;
2737    a = float64_squash_input_denormal(a STATUS_VAR);
2738
2739    aSig = extractFloat64Frac( a );
2740    aExp = extractFloat64Exp( a );
2741    aSign = extractFloat64Sign( a );
2742    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2743    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2744    shiftCount = 0x42C - aExp;
2745    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2746    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2747
2748}
2749
2750/*----------------------------------------------------------------------------
2751| Returns the result of converting the double-precision floating-point value
2752| `a' to the 32-bit two's complement integer format.  The conversion is
2753| performed according to the IEC/IEEE Standard for Binary Floating-Point
2754| Arithmetic, except that the conversion is always rounded toward zero.
2755| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2756| the conversion overflows, the largest integer with the same sign as `a' is
2757| returned.
2758*----------------------------------------------------------------------------*/
2759
2760int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2761{
2762    flag aSign;
2763    int_fast16_t aExp, shiftCount;
2764    uint64_t aSig, savedASig;
2765    int32_t z;
2766    a = float64_squash_input_denormal(a STATUS_VAR);
2767
2768    aSig = extractFloat64Frac( a );
2769    aExp = extractFloat64Exp( a );
2770    aSign = extractFloat64Sign( a );
2771    if ( 0x41E < aExp ) {
2772        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2773        goto invalid;
2774    }
2775    else if ( aExp < 0x3FF ) {
2776        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2777        return 0;
2778    }
2779    aSig |= LIT64( 0x0010000000000000 );
2780    shiftCount = 0x433 - aExp;
2781    savedASig = aSig;
2782    aSig >>= shiftCount;
2783    z = aSig;
2784    if ( aSign ) z = - z;
2785    if ( ( z < 0 ) ^ aSign ) {
2786 invalid:
2787        float_raise( float_flag_invalid STATUS_VAR);
2788        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2789    }
2790    if ( ( aSig<<shiftCount ) != savedASig ) {
2791        STATUS(float_exception_flags) |= float_flag_inexact;
2792    }
2793    return z;
2794
2795}
2796
2797/*----------------------------------------------------------------------------
2798| Returns the result of converting the double-precision floating-point value
2799| `a' to the 16-bit two's complement integer format.  The conversion is
2800| performed according to the IEC/IEEE Standard for Binary Floating-Point
2801| Arithmetic, except that the conversion is always rounded toward zero.
2802| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2803| the conversion overflows, the largest integer with the same sign as `a' is
2804| returned.
2805*----------------------------------------------------------------------------*/
2806
2807int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2808{
2809    flag aSign;
2810    int_fast16_t aExp, shiftCount;
2811    uint64_t aSig, savedASig;
2812    int32 z;
2813
2814    aSig = extractFloat64Frac( a );
2815    aExp = extractFloat64Exp( a );
2816    aSign = extractFloat64Sign( a );
2817    if ( 0x40E < aExp ) {
2818        if ( ( aExp == 0x7FF ) && aSig ) {
2819            aSign = 0;
2820        }
2821        goto invalid;
2822    }
2823    else if ( aExp < 0x3FF ) {
2824        if ( aExp || aSig ) {
2825            STATUS(float_exception_flags) |= float_flag_inexact;
2826        }
2827        return 0;
2828    }
2829    aSig |= LIT64( 0x0010000000000000 );
2830    shiftCount = 0x433 - aExp;
2831    savedASig = aSig;
2832    aSig >>= shiftCount;
2833    z = aSig;
2834    if ( aSign ) {
2835        z = - z;
2836    }
2837    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2838 invalid:
2839        float_raise( float_flag_invalid STATUS_VAR);
2840        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2841    }
2842    if ( ( aSig<<shiftCount ) != savedASig ) {
2843        STATUS(float_exception_flags) |= float_flag_inexact;
2844    }
2845    return z;
2846}
2847
2848/*----------------------------------------------------------------------------
2849| Returns the result of converting the double-precision floating-point value
2850| `a' to the 64-bit two's complement integer format.  The conversion is
2851| performed according to the IEC/IEEE Standard for Binary Floating-Point
2852| Arithmetic---which means in particular that the conversion is rounded
2853| according to the current rounding mode.  If `a' is a NaN, the largest
2854| positive integer is returned.  Otherwise, if the conversion overflows, the
2855| largest integer with the same sign as `a' is returned.
2856*----------------------------------------------------------------------------*/
2857
2858int64 float64_to_int64( float64 a STATUS_PARAM )
2859{
2860    flag aSign;
2861    int_fast16_t aExp, shiftCount;
2862    uint64_t aSig, aSigExtra;
2863    a = float64_squash_input_denormal(a STATUS_VAR);
2864
2865    aSig = extractFloat64Frac( a );
2866    aExp = extractFloat64Exp( a );
2867    aSign = extractFloat64Sign( a );
2868    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2869    shiftCount = 0x433 - aExp;
2870    if ( shiftCount <= 0 ) {
2871        if ( 0x43E < aExp ) {
2872            float_raise( float_flag_invalid STATUS_VAR);
2873            if (    ! aSign
2874                 || (    ( aExp == 0x7FF )
2875                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2876               ) {
2877                return LIT64( 0x7FFFFFFFFFFFFFFF );
2878            }
2879            return (int64_t) LIT64( 0x8000000000000000 );
2880        }
2881        aSigExtra = 0;
2882        aSig <<= - shiftCount;
2883    }
2884    else {
2885        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2886    }
2887    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2888
2889}
2890
2891/*----------------------------------------------------------------------------
2892| Returns the result of converting the double-precision floating-point value
2893| `a' to the 64-bit two's complement integer format.  The conversion is
2894| performed according to the IEC/IEEE Standard for Binary Floating-Point
2895| Arithmetic, except that the conversion is always rounded toward zero.
2896| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2897| the conversion overflows, the largest integer with the same sign as `a' is
2898| returned.
2899*----------------------------------------------------------------------------*/
2900
2901int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2902{
2903    flag aSign;
2904    int_fast16_t aExp, shiftCount;
2905    uint64_t aSig;
2906    int64 z;
2907    a = float64_squash_input_denormal(a STATUS_VAR);
2908
2909    aSig = extractFloat64Frac( a );
2910    aExp = extractFloat64Exp( a );
2911    aSign = extractFloat64Sign( a );
2912    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2913    shiftCount = aExp - 0x433;
2914    if ( 0 <= shiftCount ) {
2915        if ( 0x43E <= aExp ) {
2916            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2917                float_raise( float_flag_invalid STATUS_VAR);
2918                if (    ! aSign
2919                     || (    ( aExp == 0x7FF )
2920                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2921                   ) {
2922                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2923                }
2924            }
2925            return (int64_t) LIT64( 0x8000000000000000 );
2926        }
2927        z = aSig<<shiftCount;
2928    }
2929    else {
2930        if ( aExp < 0x3FE ) {
2931            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2932            return 0;
2933        }
2934        z = aSig>>( - shiftCount );
2935        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2936            STATUS(float_exception_flags) |= float_flag_inexact;
2937        }
2938    }
2939    if ( aSign ) z = - z;
2940    return z;
2941
2942}
2943
2944/*----------------------------------------------------------------------------
2945| Returns the result of converting the double-precision floating-point value
2946| `a' to the single-precision floating-point format.  The conversion is
2947| performed according to the IEC/IEEE Standard for Binary Floating-Point
2948| Arithmetic.
2949*----------------------------------------------------------------------------*/
2950
2951float32 float64_to_float32( float64 a STATUS_PARAM )
2952{
2953    flag aSign;
2954    int_fast16_t aExp;
2955    uint64_t aSig;
2956    uint32_t zSig;
2957    a = float64_squash_input_denormal(a STATUS_VAR);
2958
2959    aSig = extractFloat64Frac( a );
2960    aExp = extractFloat64Exp( a );
2961    aSign = extractFloat64Sign( a );
2962    if ( aExp == 0x7FF ) {
2963        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2964        return packFloat32( aSign, 0xFF, 0 );
2965    }
2966    shift64RightJamming( aSig, 22, &aSig );
2967    zSig = aSig;
2968    if ( aExp || zSig ) {
2969        zSig |= 0x40000000;
2970        aExp -= 0x381;
2971    }
2972    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2973
2974}
2975
2976
2977/*----------------------------------------------------------------------------
2978| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2979| half-precision floating-point value, returning the result.  After being
2980| shifted into the proper positions, the three fields are simply added
2981| together to form the result.  This means that any integer portion of `zSig'
2982| will be added into the exponent.  Since a properly normalized significand
2983| will have an integer portion equal to 1, the `zExp' input should be 1 less
2984| than the desired result exponent whenever `zSig' is a complete, normalized
2985| significand.
2986*----------------------------------------------------------------------------*/
2987static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
2988{
2989    return make_float16(
2990        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2991}
2992
2993/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2994   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2995
2996float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2997{
2998    flag aSign;
2999    int_fast16_t aExp;
3000    uint32_t aSig;
3001
3002    aSign = extractFloat16Sign(a);
3003    aExp = extractFloat16Exp(a);
3004    aSig = extractFloat16Frac(a);
3005
3006    if (aExp == 0x1f && ieee) {
3007        if (aSig) {
3008            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3009        }
3010        return packFloat32(aSign, 0xff, 0);
3011    }
3012    if (aExp == 0) {
3013        int8 shiftCount;
3014
3015        if (aSig == 0) {
3016            return packFloat32(aSign, 0, 0);
3017        }
3018
3019        shiftCount = countLeadingZeros32( aSig ) - 21;
3020        aSig = aSig << shiftCount;
3021        aExp = -shiftCount;
3022    }
3023    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3024}
3025
3026float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3027{
3028    flag aSign;
3029    int_fast16_t aExp;
3030    uint32_t aSig;
3031    uint32_t mask;
3032    uint32_t increment;
3033    int8 roundingMode;
3034    a = float32_squash_input_denormal(a STATUS_VAR);
3035
3036    aSig = extractFloat32Frac( a );
3037    aExp = extractFloat32Exp( a );
3038    aSign = extractFloat32Sign( a );
3039    if ( aExp == 0xFF ) {
3040        if (aSig) {
3041            /* Input is a NaN */
3042            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3043            if (!ieee) {
3044                return packFloat16(aSign, 0, 0);
3045            }
3046            return r;
3047        }
3048        /* Infinity */
3049        if (!ieee) {
3050            float_raise(float_flag_invalid STATUS_VAR);
3051            return packFloat16(aSign, 0x1f, 0x3ff);
3052        }
3053        return packFloat16(aSign, 0x1f, 0);
3054    }
3055    if (aExp == 0 && aSig == 0) {
3056        return packFloat16(aSign, 0, 0);
3057    }
3058    /* Decimal point between bits 22 and 23.  */
3059    aSig |= 0x00800000;
3060    aExp -= 0x7f;
3061    if (aExp < -14) {
3062        mask = 0x00ffffff;
3063        if (aExp >= -24) {
3064            mask >>= 25 + aExp;
3065        }
3066    } else {
3067        mask = 0x00001fff;
3068    }
3069    if (aSig & mask) {
3070        float_raise( float_flag_underflow STATUS_VAR );
3071        roundingMode = STATUS(float_rounding_mode);
3072        switch (roundingMode) {
3073        case float_round_nearest_even:
3074            increment = (mask + 1) >> 1;
3075            if ((aSig & mask) == increment) {
3076                increment = aSig & (increment << 1);
3077            }
3078            break;
3079        case float_round_up:
3080            increment = aSign ? 0 : mask;
3081            break;
3082        case float_round_down:
3083            increment = aSign ? mask : 0;
3084            break;
3085        default: /* round_to_zero */
3086            increment = 0;
3087            break;
3088        }
3089        aSig += increment;
3090        if (aSig >= 0x01000000) {
3091            aSig >>= 1;
3092            aExp++;
3093        }
3094    } else if (aExp < -14
3095          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3096        float_raise( float_flag_underflow STATUS_VAR);
3097    }
3098
3099    if (ieee) {
3100        if (aExp > 15) {
3101            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3102            return packFloat16(aSign, 0x1f, 0);
3103        }
3104    } else {
3105        if (aExp > 16) {
3106            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3107            return packFloat16(aSign, 0x1f, 0x3ff);
3108        }
3109    }
3110    if (aExp < -24) {
3111        return packFloat16(aSign, 0, 0);
3112    }
3113    if (aExp < -14) {
3114        aSig >>= -14 - aExp;
3115        aExp = -14;
3116    }
3117    return packFloat16(aSign, aExp + 14, aSig >> 13);
3118}
3119
3120/*----------------------------------------------------------------------------
3121| Returns the result of converting the double-precision floating-point value
3122| `a' to the extended double-precision floating-point format.  The conversion
3123| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3124| Arithmetic.
3125*----------------------------------------------------------------------------*/
3126
3127floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3128{
3129    flag aSign;
3130    int_fast16_t aExp;
3131    uint64_t aSig;
3132
3133    a = float64_squash_input_denormal(a STATUS_VAR);
3134    aSig = extractFloat64Frac( a );
3135    aExp = extractFloat64Exp( a );
3136    aSign = extractFloat64Sign( a );
3137    if ( aExp == 0x7FF ) {
3138        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3139        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3140    }
3141    if ( aExp == 0 ) {
3142        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3143        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3144    }
3145    return
3146        packFloatx80(
3147            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3148
3149}
3150
3151/*----------------------------------------------------------------------------
3152| Returns the result of converting the double-precision floating-point value
3153| `a' to the quadruple-precision floating-point format.  The conversion is
3154| performed according to the IEC/IEEE Standard for Binary Floating-Point
3155| Arithmetic.
3156*----------------------------------------------------------------------------*/
3157
3158float128 float64_to_float128( float64 a STATUS_PARAM )
3159{
3160    flag aSign;
3161    int_fast16_t aExp;
3162    uint64_t aSig, zSig0, zSig1;
3163
3164    a = float64_squash_input_denormal(a STATUS_VAR);
3165    aSig = extractFloat64Frac( a );
3166    aExp = extractFloat64Exp( a );
3167    aSign = extractFloat64Sign( a );
3168    if ( aExp == 0x7FF ) {
3169        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3170        return packFloat128( aSign, 0x7FFF, 0, 0 );
3171    }
3172    if ( aExp == 0 ) {
3173        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3174        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3175        --aExp;
3176    }
3177    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3178    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3179
3180}
3181
3182/*----------------------------------------------------------------------------
3183| Rounds the double-precision floating-point value `a' to an integer, and
3184| returns the result as a double-precision floating-point value.  The
3185| operation is performed according to the IEC/IEEE Standard for Binary
3186| Floating-Point Arithmetic.
3187*----------------------------------------------------------------------------*/
3188
3189float64 float64_round_to_int( float64 a STATUS_PARAM )
3190{
3191    flag aSign;
3192    int_fast16_t aExp;
3193    uint64_t lastBitMask, roundBitsMask;
3194    int8 roundingMode;
3195    uint64_t z;
3196    a = float64_squash_input_denormal(a STATUS_VAR);
3197
3198    aExp = extractFloat64Exp( a );
3199    if ( 0x433 <= aExp ) {
3200        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3201            return propagateFloat64NaN( a, a STATUS_VAR );
3202        }
3203        return a;
3204    }
3205    if ( aExp < 0x3FF ) {
3206        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3207        STATUS(float_exception_flags) |= float_flag_inexact;
3208        aSign = extractFloat64Sign( a );
3209        switch ( STATUS(float_rounding_mode) ) {
3210         case float_round_nearest_even:
3211            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3212                return packFloat64( aSign, 0x3FF, 0 );
3213            }
3214            break;
3215         case float_round_down:
3216            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3217         case float_round_up:
3218            return make_float64(
3219            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3220        }
3221        return packFloat64( aSign, 0, 0 );
3222    }
3223    lastBitMask = 1;
3224    lastBitMask <<= 0x433 - aExp;
3225    roundBitsMask = lastBitMask - 1;
3226    z = float64_val(a);
3227    roundingMode = STATUS(float_rounding_mode);
3228    if ( roundingMode == float_round_nearest_even ) {
3229        z += lastBitMask>>1;
3230        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3231    }
3232    else if ( roundingMode != float_round_to_zero ) {
3233        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3234            z += roundBitsMask;
3235        }
3236    }
3237    z &= ~ roundBitsMask;
3238    if ( z != float64_val(a) )
3239        STATUS(float_exception_flags) |= float_flag_inexact;
3240    return make_float64(z);
3241
3242}
3243
3244float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3245{
3246    int oldmode;
3247    float64 res;
3248    oldmode = STATUS(float_rounding_mode);
3249    STATUS(float_rounding_mode) = float_round_to_zero;
3250    res = float64_round_to_int(a STATUS_VAR);
3251    STATUS(float_rounding_mode) = oldmode;
3252    return res;
3253}
3254
3255/*----------------------------------------------------------------------------
3256| Returns the result of adding the absolute values of the double-precision
3257| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3258| before being returned.  `zSign' is ignored if the result is a NaN.
3259| The addition is performed according to the IEC/IEEE Standard for Binary
3260| Floating-Point Arithmetic.
3261*----------------------------------------------------------------------------*/
3262
3263static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3264{
3265    int_fast16_t aExp, bExp, zExp;
3266    uint64_t aSig, bSig, zSig;
3267    int_fast16_t expDiff;
3268
3269    aSig = extractFloat64Frac( a );
3270    aExp = extractFloat64Exp( a );
3271    bSig = extractFloat64Frac( b );
3272    bExp = extractFloat64Exp( b );
3273    expDiff = aExp - bExp;
3274    aSig <<= 9;
3275    bSig <<= 9;
3276    if ( 0 < expDiff ) {
3277        if ( aExp == 0x7FF ) {
3278            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3279            return a;
3280        }
3281        if ( bExp == 0 ) {
3282            --expDiff;
3283        }
3284        else {
3285            bSig |= LIT64( 0x2000000000000000 );
3286        }
3287        shift64RightJamming( bSig, expDiff, &bSig );
3288        zExp = aExp;
3289    }
3290    else if ( expDiff < 0 ) {
3291        if ( bExp == 0x7FF ) {
3292            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3293            return packFloat64( zSign, 0x7FF, 0 );
3294        }
3295        if ( aExp == 0 ) {
3296            ++expDiff;
3297        }
3298        else {
3299            aSig |= LIT64( 0x2000000000000000 );
3300        }
3301        shift64RightJamming( aSig, - expDiff, &aSig );
3302        zExp = bExp;
3303    }
3304    else {
3305        if ( aExp == 0x7FF ) {
3306            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307            return a;
3308        }
3309        if ( aExp == 0 ) {
3310            if (STATUS(flush_to_zero)) {
3311                if (aSig | bSig) {
3312                    float_raise(float_flag_output_denormal STATUS_VAR);
3313                }
3314                return packFloat64(zSign, 0, 0);
3315            }
3316            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3317        }
3318        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3319        zExp = aExp;
3320        goto roundAndPack;
3321    }
3322    aSig |= LIT64( 0x2000000000000000 );
3323    zSig = ( aSig + bSig )<<1;
3324    --zExp;
3325    if ( (int64_t) zSig < 0 ) {
3326        zSig = aSig + bSig;
3327        ++zExp;
3328    }
3329 roundAndPack:
3330    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3331
3332}
3333
3334/*----------------------------------------------------------------------------
3335| Returns the result of subtracting the absolute values of the double-
3336| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3337| difference is negated before being returned.  `zSign' is ignored if the
3338| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3339| Standard for Binary Floating-Point Arithmetic.
3340*----------------------------------------------------------------------------*/
3341
3342static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3343{
3344    int_fast16_t aExp, bExp, zExp;
3345    uint64_t aSig, bSig, zSig;
3346    int_fast16_t expDiff;
3347
3348    aSig = extractFloat64Frac( a );
3349    aExp = extractFloat64Exp( a );
3350    bSig = extractFloat64Frac( b );
3351    bExp = extractFloat64Exp( b );
3352    expDiff = aExp - bExp;
3353    aSig <<= 10;
3354    bSig <<= 10;
3355    if ( 0 < expDiff ) goto aExpBigger;
3356    if ( expDiff < 0 ) goto bExpBigger;
3357    if ( aExp == 0x7FF ) {
3358        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3359        float_raise( float_flag_invalid STATUS_VAR);
3360        return float64_default_nan;
3361    }
3362    if ( aExp == 0 ) {
3363        aExp = 1;
3364        bExp = 1;
3365    }
3366    if ( bSig < aSig ) goto aBigger;
3367    if ( aSig < bSig ) goto bBigger;
3368    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3369 bExpBigger:
3370    if ( bExp == 0x7FF ) {
3371        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3372        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3373    }
3374    if ( aExp == 0 ) {
3375        ++expDiff;
3376    }
3377    else {
3378        aSig |= LIT64( 0x4000000000000000 );
3379    }
3380    shift64RightJamming( aSig, - expDiff, &aSig );
3381    bSig |= LIT64( 0x4000000000000000 );
3382 bBigger:
3383    zSig = bSig - aSig;
3384    zExp = bExp;
3385    zSign ^= 1;
3386    goto normalizeRoundAndPack;
3387 aExpBigger:
3388    if ( aExp == 0x7FF ) {
3389        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3390        return a;
3391    }
3392    if ( bExp == 0 ) {
3393        --expDiff;
3394    }
3395    else {
3396        bSig |= LIT64( 0x4000000000000000 );
3397    }
3398    shift64RightJamming( bSig, expDiff, &bSig );
3399    aSig |= LIT64( 0x4000000000000000 );
3400 aBigger:
3401    zSig = aSig - bSig;
3402    zExp = aExp;
3403 normalizeRoundAndPack:
3404    --zExp;
3405    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3406
3407}
3408
3409/*----------------------------------------------------------------------------
3410| Returns the result of adding the double-precision floating-point values `a'
3411| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3412| Binary Floating-Point Arithmetic.
3413*----------------------------------------------------------------------------*/
3414
3415float64 float64_add( float64 a, float64 b STATUS_PARAM )
3416{
3417    flag aSign, bSign;
3418    a = float64_squash_input_denormal(a STATUS_VAR);
3419    b = float64_squash_input_denormal(b STATUS_VAR);
3420
3421    aSign = extractFloat64Sign( a );
3422    bSign = extractFloat64Sign( b );
3423    if ( aSign == bSign ) {
3424        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3425    }
3426    else {
3427        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3428    }
3429
3430}
3431
3432/*----------------------------------------------------------------------------
3433| Returns the result of subtracting the double-precision floating-point values
3434| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3435| for Binary Floating-Point Arithmetic.
3436*----------------------------------------------------------------------------*/
3437
3438float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3439{
3440    flag aSign, bSign;
3441    a = float64_squash_input_denormal(a STATUS_VAR);
3442    b = float64_squash_input_denormal(b STATUS_VAR);
3443
3444    aSign = extractFloat64Sign( a );
3445    bSign = extractFloat64Sign( b );
3446    if ( aSign == bSign ) {
3447        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3448    }
3449    else {
3450        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3451    }
3452
3453}
3454
3455/*----------------------------------------------------------------------------
3456| Returns the result of multiplying the double-precision floating-point values
3457| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3458| for Binary Floating-Point Arithmetic.
3459*----------------------------------------------------------------------------*/
3460
3461float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3462{
3463    flag aSign, bSign, zSign;
3464    int_fast16_t aExp, bExp, zExp;
3465    uint64_t aSig, bSig, zSig0, zSig1;
3466
3467    a = float64_squash_input_denormal(a STATUS_VAR);
3468    b = float64_squash_input_denormal(b STATUS_VAR);
3469
3470    aSig = extractFloat64Frac( a );
3471    aExp = extractFloat64Exp( a );
3472    aSign = extractFloat64Sign( a );
3473    bSig = extractFloat64Frac( b );
3474    bExp = extractFloat64Exp( b );
3475    bSign = extractFloat64Sign( b );
3476    zSign = aSign ^ bSign;
3477    if ( aExp == 0x7FF ) {
3478        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3479            return propagateFloat64NaN( a, b STATUS_VAR );
3480        }
3481        if ( ( bExp | bSig ) == 0 ) {
3482            float_raise( float_flag_invalid STATUS_VAR);
3483            return float64_default_nan;
3484        }
3485        return packFloat64( zSign, 0x7FF, 0 );
3486    }
3487    if ( bExp == 0x7FF ) {
3488        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3489        if ( ( aExp | aSig ) == 0 ) {
3490            float_raise( float_flag_invalid STATUS_VAR);
3491            return float64_default_nan;
3492        }
3493        return packFloat64( zSign, 0x7FF, 0 );
3494    }
3495    if ( aExp == 0 ) {
3496        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3497        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3498    }
3499    if ( bExp == 0 ) {
3500        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3501        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3502    }
3503    zExp = aExp + bExp - 0x3FF;
3504    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3505    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3506    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3507    zSig0 |= ( zSig1 != 0 );
3508    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3509        zSig0 <<= 1;
3510        --zExp;
3511    }
3512    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3513
3514}
3515
3516/*----------------------------------------------------------------------------
3517| Returns the result of dividing the double-precision floating-point value `a'
3518| by the corresponding value `b'.  The operation is performed according to
3519| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3520*----------------------------------------------------------------------------*/
3521
3522float64 float64_div( float64 a, float64 b STATUS_PARAM )
3523{
3524    flag aSign, bSign, zSign;
3525    int_fast16_t aExp, bExp, zExp;
3526    uint64_t aSig, bSig, zSig;
3527    uint64_t rem0, rem1;
3528    uint64_t term0, term1;
3529    a = float64_squash_input_denormal(a STATUS_VAR);
3530    b = float64_squash_input_denormal(b STATUS_VAR);
3531
3532    aSig = extractFloat64Frac( a );
3533    aExp = extractFloat64Exp( a );
3534    aSign = extractFloat64Sign( a );
3535    bSig = extractFloat64Frac( b );
3536    bExp = extractFloat64Exp( b );
3537    bSign = extractFloat64Sign( b );
3538    zSign = aSign ^ bSign;
3539    if ( aExp == 0x7FF ) {
3540        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3541        if ( bExp == 0x7FF ) {
3542            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3543            float_raise( float_flag_invalid STATUS_VAR);
3544            return float64_default_nan;
3545        }
3546        return packFloat64( zSign, 0x7FF, 0 );
3547    }
3548    if ( bExp == 0x7FF ) {
3549        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3550        return packFloat64( zSign, 0, 0 );
3551    }
3552    if ( bExp == 0 ) {
3553        if ( bSig == 0 ) {
3554            if ( ( aExp | aSig ) == 0 ) {
3555                float_raise( float_flag_invalid STATUS_VAR);
3556                return float64_default_nan;
3557            }
3558            float_raise( float_flag_divbyzero STATUS_VAR);
3559            return packFloat64( zSign, 0x7FF, 0 );
3560        }
3561        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3562    }
3563    if ( aExp == 0 ) {
3564        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3565        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3566    }
3567    zExp = aExp - bExp + 0x3FD;
3568    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3569    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3570    if ( bSig <= ( aSig + aSig ) ) {
3571        aSig >>= 1;
3572        ++zExp;
3573    }
3574    zSig = estimateDiv128To64( aSig, 0, bSig );
3575    if ( ( zSig & 0x1FF ) <= 2 ) {
3576        mul64To128( bSig, zSig, &term0, &term1 );
3577        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3578        while ( (int64_t) rem0 < 0 ) {
3579            --zSig;
3580            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3581        }
3582        zSig |= ( rem1 != 0 );
3583    }
3584    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3585
3586}
3587
3588/*----------------------------------------------------------------------------
3589| Returns the remainder of the double-precision floating-point value `a'
3590| with respect to the corresponding value `b'.  The operation is performed
3591| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3592*----------------------------------------------------------------------------*/
3593
3594float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3595{
3596    flag aSign, zSign;
3597    int_fast16_t aExp, bExp, expDiff;
3598    uint64_t aSig, bSig;
3599    uint64_t q, alternateASig;
3600    int64_t sigMean;
3601
3602    a = float64_squash_input_denormal(a STATUS_VAR);
3603    b = float64_squash_input_denormal(b STATUS_VAR);
3604    aSig = extractFloat64Frac( a );
3605    aExp = extractFloat64Exp( a );
3606    aSign = extractFloat64Sign( a );
3607    bSig = extractFloat64Frac( b );
3608    bExp = extractFloat64Exp( b );
3609    if ( aExp == 0x7FF ) {
3610        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3611            return propagateFloat64NaN( a, b STATUS_VAR );
3612        }
3613        float_raise( float_flag_invalid STATUS_VAR);
3614        return float64_default_nan;
3615    }
3616    if ( bExp == 0x7FF ) {
3617        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3618        return a;
3619    }
3620    if ( bExp == 0 ) {
3621        if ( bSig == 0 ) {
3622            float_raise( float_flag_invalid STATUS_VAR);
3623            return float64_default_nan;
3624        }
3625        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3626    }
3627    if ( aExp == 0 ) {
3628        if ( aSig == 0 ) return a;
3629        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3630    }
3631    expDiff = aExp - bExp;
3632    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3633    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3634    if ( expDiff < 0 ) {
3635        if ( expDiff < -1 ) return a;
3636        aSig >>= 1;
3637    }
3638    q = ( bSig <= aSig );
3639    if ( q ) aSig -= bSig;
3640    expDiff -= 64;
3641    while ( 0 < expDiff ) {
3642        q = estimateDiv128To64( aSig, 0, bSig );
3643        q = ( 2 < q ) ? q - 2 : 0;
3644        aSig = - ( ( bSig>>2 ) * q );
3645        expDiff -= 62;
3646    }
3647    expDiff += 64;
3648    if ( 0 < expDiff ) {
3649        q = estimateDiv128To64( aSig, 0, bSig );
3650        q = ( 2 < q ) ? q - 2 : 0;
3651        q >>= 64 - expDiff;
3652        bSig >>= 2;
3653        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3654    }
3655    else {
3656        aSig >>= 2;
3657        bSig >>= 2;
3658    }
3659    do {
3660        alternateASig = aSig;
3661        ++q;
3662        aSig -= bSig;
3663    } while ( 0 <= (int64_t) aSig );
3664    sigMean = aSig + alternateASig;
3665    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3666        aSig = alternateASig;
3667    }
3668    zSign = ( (int64_t) aSig < 0 );
3669    if ( zSign ) aSig = - aSig;
3670    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3671
3672}
3673
3674/*----------------------------------------------------------------------------
3675| Returns the result of multiplying the double-precision floating-point values
3676| `a' and `b' then adding 'c', with no intermediate rounding step after the
3677| multiplication.  The operation is performed according to the IEC/IEEE
3678| Standard for Binary Floating-Point Arithmetic 754-2008.
3679| The flags argument allows the caller to select negation of the
3680| addend, the intermediate product, or the final result. (The difference
3681| between this and having the caller do a separate negation is that negating
3682| externally will flip the sign bit on NaNs.)
3683*----------------------------------------------------------------------------*/
3684
3685float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3686{
3687    flag aSign, bSign, cSign, zSign;
3688    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3689    uint64_t aSig, bSig, cSig;
3690    flag pInf, pZero, pSign;
3691    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3692    int shiftcount;
3693    flag signflip, infzero;
3694
3695    a = float64_squash_input_denormal(a STATUS_VAR);
3696    b = float64_squash_input_denormal(b STATUS_VAR);
3697    c = float64_squash_input_denormal(c STATUS_VAR);
3698    aSig = extractFloat64Frac(a);
3699    aExp = extractFloat64Exp(a);
3700    aSign = extractFloat64Sign(a);
3701    bSig = extractFloat64Frac(b);
3702    bExp = extractFloat64Exp(b);
3703    bSign = extractFloat64Sign(b);
3704    cSig = extractFloat64Frac(c);
3705    cExp = extractFloat64Exp(c);
3706    cSign = extractFloat64Sign(c);
3707
3708    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3709               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3710
3711    /* It is implementation-defined whether the cases of (0,inf,qnan)
3712     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3713     * they return if they do), so we have to hand this information
3714     * off to the target-specific pick-a-NaN routine.
3715     */
3716    if (((aExp == 0x7ff) && aSig) ||
3717        ((bExp == 0x7ff) && bSig) ||
3718        ((cExp == 0x7ff) && cSig)) {
3719        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3720    }
3721
3722    if (infzero) {
3723        float_raise(float_flag_invalid STATUS_VAR);
3724        return float64_default_nan;
3725    }
3726
3727    if (flags & float_muladd_negate_c) {
3728        cSign ^= 1;
3729    }
3730
3731    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3732
3733    /* Work out the sign and type of the product */
3734    pSign = aSign ^ bSign;
3735    if (flags & float_muladd_negate_product) {
3736        pSign ^= 1;
3737    }
3738    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3739    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3740
3741    if (cExp == 0x7ff) {
3742        if (pInf && (pSign ^ cSign)) {
3743            /* addition of opposite-signed infinities => InvalidOperation */
3744            float_raise(float_flag_invalid STATUS_VAR);
3745            return float64_default_nan;
3746        }
3747        /* Otherwise generate an infinity of the same sign */
3748        return packFloat64(cSign ^ signflip, 0x7ff, 0);
3749    }
3750
3751    if (pInf) {
3752        return packFloat64(pSign ^ signflip, 0x7ff, 0);
3753    }
3754
3755    if (pZero) {
3756        if (cExp == 0) {
3757            if (cSig == 0) {
3758                /* Adding two exact zeroes */
3759                if (pSign == cSign) {
3760                    zSign = pSign;
3761                } else if (STATUS(float_rounding_mode) == float_round_down) {
3762                    zSign = 1;
3763                } else {
3764                    zSign = 0;
3765                }
3766                return packFloat64(zSign ^ signflip, 0, 0);
3767            }
3768            /* Exact zero plus a denorm */
3769            if (STATUS(flush_to_zero)) {
3770                float_raise(float_flag_output_denormal STATUS_VAR);
3771                return packFloat64(cSign ^ signflip, 0, 0);
3772            }
3773        }
3774        /* Zero plus something non-zero : just return the something */
3775        return make_float64(float64_val(c) ^ ((uint64_t)signflip << 63));
3776    }
3777
3778    if (aExp == 0) {
3779        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3780    }
3781    if (bExp == 0) {
3782        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3783    }
3784
3785    /* Calculate the actual result a * b + c */
3786
3787    /* Multiply first; this is easy. */
3788    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3789     * because we want the true exponent, not the "one-less-than"
3790     * flavour that roundAndPackFloat64() takes.
3791     */
3792    pExp = aExp + bExp - 0x3fe;
3793    aSig = (aSig | LIT64(0x0010000000000000))<<10;
3794    bSig = (bSig | LIT64(0x0010000000000000))<<11;
3795    mul64To128(aSig, bSig, &pSig0, &pSig1);
3796    if ((int64_t)(pSig0 << 1) >= 0) {
3797        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3798        pExp--;
3799    }
3800
3801    zSign = pSign ^ signflip;
3802
3803    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3804     * bit in position 126.
3805     */
3806    if (cExp == 0) {
3807        if (!cSig) {
3808            /* Throw out the special case of c being an exact zero now */
3809            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3810            return roundAndPackFloat64(zSign, pExp - 1,
3811                                       pSig1 STATUS_VAR);
3812        }
3813        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3814    }
3815
3816    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3817     * significand of the addend, with the explicit bit in position 126.
3818     */
3819    cSig0 = cSig << (126 - 64 - 52);
3820    cSig1 = 0;
3821    cSig0 |= LIT64(0x4000000000000000);
3822    expDiff = pExp - cExp;
3823
3824    if (pSign == cSign) {
3825        /* Addition */
3826        if (expDiff > 0) {
3827            /* scale c to match p */
3828            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3829            zExp = pExp;
3830        } else if (expDiff < 0) {
3831            /* scale p to match c */
3832            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3833            zExp = cExp;
3834        } else {
3835            /* no scaling needed */
3836            zExp = cExp;
3837        }
3838        /* Add significands and make sure explicit bit ends up in posn 126 */
3839        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3840        if ((int64_t)zSig0 < 0) {
3841            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3842        } else {
3843            zExp--;
3844        }
3845        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3846        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3847    } else {
3848        /* Subtraction */
3849        if (expDiff > 0) {
3850            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3851            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3852            zExp = pExp;
3853        } else if (expDiff < 0) {
3854            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3855            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3856            zExp = cExp;
3857            zSign ^= 1;
3858        } else {
3859            zExp = pExp;
3860            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3861                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3862            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3863                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3864                zSign ^= 1;
3865            } else {
3866                /* Exact zero */
3867                zSign = signflip;
3868                if (STATUS(float_rounding_mode) == float_round_down) {
3869                    zSign ^= 1;
3870                }
3871                return packFloat64(zSign, 0, 0);
3872            }
3873        }
3874        --zExp;
3875        /* Do the equivalent of normalizeRoundAndPackFloat64() but
3876         * starting with the significand in a pair of uint64_t.
3877         */
3878        if (zSig0) {
3879            shiftcount = countLeadingZeros64(zSig0) - 1;
3880            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3881            if (zSig1) {
3882                zSig0 |= 1;
3883            }
3884            zExp -= shiftcount;
3885        } else {
3886            shiftcount = countLeadingZeros64(zSig1) - 1;
3887            zSig0 = zSig1 << shiftcount;
3888            zExp -= (shiftcount + 64);
3889        }
3890        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3891    }
3892}
3893
3894/*----------------------------------------------------------------------------
3895| Returns the square root of the double-precision floating-point value `a'.
3896| The operation is performed according to the IEC/IEEE Standard for Binary
3897| Floating-Point Arithmetic.
3898*----------------------------------------------------------------------------*/
3899
3900float64 float64_sqrt( float64 a STATUS_PARAM )
3901{
3902    flag aSign;
3903    int_fast16_t aExp, zExp;
3904    uint64_t aSig, zSig, doubleZSig;
3905    uint64_t rem0, rem1, term0, term1;
3906    a = float64_squash_input_denormal(a STATUS_VAR);
3907
3908    aSig = extractFloat64Frac( a );
3909    aExp = extractFloat64Exp( a );
3910    aSign = extractFloat64Sign( a );
3911    if ( aExp == 0x7FF ) {
3912        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3913        if ( ! aSign ) return a;
3914        float_raise( float_flag_invalid STATUS_VAR);
3915        return float64_default_nan;
3916    }
3917    if ( aSign ) {
3918        if ( ( aExp | aSig ) == 0 ) return a;
3919        float_raise( float_flag_invalid STATUS_VAR);
3920        return float64_default_nan;
3921    }
3922    if ( aExp == 0 ) {
3923        if ( aSig == 0 ) return float64_zero;
3924        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3925    }
3926    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3927    aSig |= LIT64( 0x0010000000000000 );
3928    zSig = estimateSqrt32( aExp, aSig>>21 );
3929    aSig <<= 9 - ( aExp & 1 );
3930    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3931    if ( ( zSig & 0x1FF ) <= 5 ) {
3932        doubleZSig = zSig<<1;
3933        mul64To128( zSig, zSig, &term0, &term1 );
3934        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3935        while ( (int64_t) rem0 < 0 ) {
3936            --zSig;
3937            doubleZSig -= 2;
3938            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3939        }
3940        zSig |= ( ( rem0 | rem1 ) != 0 );
3941    }
3942    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3943
3944}
3945
3946/*----------------------------------------------------------------------------
3947| Returns the binary log of the double-precision floating-point value `a'.
3948| The operation is performed according to the IEC/IEEE Standard for Binary
3949| Floating-Point Arithmetic.
3950*----------------------------------------------------------------------------*/
3951float64 float64_log2( float64 a STATUS_PARAM )
3952{
3953    flag aSign, zSign;
3954    int_fast16_t aExp;
3955    uint64_t aSig, aSig0, aSig1, zSig, i;
3956    a = float64_squash_input_denormal(a STATUS_VAR);
3957
3958    aSig = extractFloat64Frac( a );
3959    aExp = extractFloat64Exp( a );
3960    aSign = extractFloat64Sign( a );
3961
3962    if ( aExp == 0 ) {
3963        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3964        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3965    }
3966    if ( aSign ) {
3967        float_raise( float_flag_invalid STATUS_VAR);
3968        return float64_default_nan;
3969    }
3970    if ( aExp == 0x7FF ) {
3971        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3972        return a;
3973    }
3974
3975    aExp -= 0x3FF;
3976    aSig |= LIT64( 0x0010000000000000 );
3977    zSign = aExp < 0;
3978    zSig = (uint64_t)aExp << 52;
3979    for (i = 1LL << 51; i > 0; i >>= 1) {
3980        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3981        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3982        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3983            aSig >>= 1;
3984            zSig |= i;
3985        }
3986    }
3987
3988    if ( zSign )
3989        zSig = -zSig;
3990    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3991}
3992
3993/*----------------------------------------------------------------------------
3994| Returns 1 if the double-precision floating-point value `a' is equal to the
3995| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3996| if either operand is a NaN.  Otherwise, the comparison is performed
3997| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998*----------------------------------------------------------------------------*/
3999
4000int float64_eq( float64 a, float64 b STATUS_PARAM )
4001{
4002    uint64_t av, bv;
4003    a = float64_squash_input_denormal(a STATUS_VAR);
4004    b = float64_squash_input_denormal(b STATUS_VAR);
4005
4006    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4007         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4008       ) {
4009        float_raise( float_flag_invalid STATUS_VAR);
4010        return 0;
4011    }
4012    av = float64_val(a);
4013    bv = float64_val(b);
4014    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4015
4016}
4017
4018/*----------------------------------------------------------------------------
4019| Returns 1 if the double-precision floating-point value `a' is less than or
4020| equal to the corresponding value `b', and 0 otherwise.  The invalid
4021| exception is raised if either operand is a NaN.  The comparison is performed
4022| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4023*----------------------------------------------------------------------------*/
4024
4025int float64_le( float64 a, float64 b STATUS_PARAM )
4026{
4027    flag aSign, bSign;
4028    uint64_t av, bv;
4029    a = float64_squash_input_denormal(a STATUS_VAR);
4030    b = float64_squash_input_denormal(b STATUS_VAR);
4031
4032    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4033         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4034       ) {
4035        float_raise( float_flag_invalid STATUS_VAR);
4036        return 0;
4037    }
4038    aSign = extractFloat64Sign( a );
4039    bSign = extractFloat64Sign( b );
4040    av = float64_val(a);
4041    bv = float64_val(b);
4042    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4043    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4044
4045}
4046
4047/*----------------------------------------------------------------------------
4048| Returns 1 if the double-precision floating-point value `a' is less than
4049| the corresponding value `b', and 0 otherwise.  The invalid exception is
4050| raised if either operand is a NaN.  The comparison is performed according
4051| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052*----------------------------------------------------------------------------*/
4053
4054int float64_lt( float64 a, float64 b STATUS_PARAM )
4055{
4056    flag aSign, bSign;
4057    uint64_t av, bv;
4058
4059    a = float64_squash_input_denormal(a STATUS_VAR);
4060    b = float64_squash_input_denormal(b STATUS_VAR);
4061    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4062         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4063       ) {
4064        float_raise( float_flag_invalid STATUS_VAR);
4065        return 0;
4066    }
4067    aSign = extractFloat64Sign( a );
4068    bSign = extractFloat64Sign( b );
4069    av = float64_val(a);
4070    bv = float64_val(b);
4071    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4072    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4073
4074}
4075
4076/*----------------------------------------------------------------------------
4077| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4078| be compared, and 0 otherwise.  The invalid exception is raised if either
4079| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4080| Standard for Binary Floating-Point Arithmetic.
4081*----------------------------------------------------------------------------*/
4082
4083int float64_unordered( float64 a, float64 b STATUS_PARAM )
4084{
4085    a = float64_squash_input_denormal(a STATUS_VAR);
4086    b = float64_squash_input_denormal(b STATUS_VAR);
4087
4088    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4089         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4090       ) {
4091        float_raise( float_flag_invalid STATUS_VAR);
4092        return 1;
4093    }
4094    return 0;
4095}
4096
4097/*----------------------------------------------------------------------------
4098| Returns 1 if the double-precision floating-point value `a' is equal to the
4099| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4100| exception.The comparison is performed according to the IEC/IEEE Standard
4101| for Binary Floating-Point Arithmetic.
4102*----------------------------------------------------------------------------*/
4103
4104int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4105{
4106    uint64_t av, bv;
4107    a = float64_squash_input_denormal(a STATUS_VAR);
4108    b = float64_squash_input_denormal(b STATUS_VAR);
4109
4110    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4111         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4112       ) {
4113        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4114            float_raise( float_flag_invalid STATUS_VAR);
4115        }
4116        return 0;
4117    }
4118    av = float64_val(a);
4119    bv = float64_val(b);
4120    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4121
4122}
4123
4124/*----------------------------------------------------------------------------
4125| Returns 1 if the double-precision floating-point value `a' is less than or
4126| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4127| cause an exception.  Otherwise, the comparison is performed according to the
4128| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4129*----------------------------------------------------------------------------*/
4130
4131int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4132{
4133    flag aSign, bSign;
4134    uint64_t av, bv;
4135    a = float64_squash_input_denormal(a STATUS_VAR);
4136    b = float64_squash_input_denormal(b STATUS_VAR);
4137
4138    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4139         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4140       ) {
4141        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4142            float_raise( float_flag_invalid STATUS_VAR);
4143        }
4144        return 0;
4145    }
4146    aSign = extractFloat64Sign( a );
4147    bSign = extractFloat64Sign( b );
4148    av = float64_val(a);
4149    bv = float64_val(b);
4150    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4151    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4152
4153}
4154
4155/*----------------------------------------------------------------------------
4156| Returns 1 if the double-precision floating-point value `a' is less than
4157| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4158| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4159| Standard for Binary Floating-Point Arithmetic.
4160*----------------------------------------------------------------------------*/
4161
4162int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4163{
4164    flag aSign, bSign;
4165    uint64_t av, bv;
4166    a = float64_squash_input_denormal(a STATUS_VAR);
4167    b = float64_squash_input_denormal(b STATUS_VAR);
4168
4169    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4170         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4171       ) {
4172        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4173            float_raise( float_flag_invalid STATUS_VAR);
4174        }
4175        return 0;
4176    }
4177    aSign = extractFloat64Sign( a );
4178    bSign = extractFloat64Sign( b );
4179    av = float64_val(a);
4180    bv = float64_val(b);
4181    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4182    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4183
4184}
4185
4186/*----------------------------------------------------------------------------
4187| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4188| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4189| comparison is performed according to the IEC/IEEE Standard for Binary
4190| Floating-Point Arithmetic.
4191*----------------------------------------------------------------------------*/
4192
4193int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4194{
4195    a = float64_squash_input_denormal(a STATUS_VAR);
4196    b = float64_squash_input_denormal(b STATUS_VAR);
4197
4198    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4199         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4200       ) {
4201        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4202            float_raise( float_flag_invalid STATUS_VAR);
4203        }
4204        return 1;
4205    }
4206    return 0;
4207}
4208
4209/*----------------------------------------------------------------------------
4210| Returns the result of converting the extended double-precision floating-
4211| point value `a' to the 32-bit two's complement integer format.  The
4212| conversion is performed according to the IEC/IEEE Standard for Binary
4213| Floating-Point Arithmetic---which means in particular that the conversion
4214| is rounded according to the current rounding mode.  If `a' is a NaN, the
4215| largest positive integer is returned.  Otherwise, if the conversion
4216| overflows, the largest integer with the same sign as `a' is returned.
4217*----------------------------------------------------------------------------*/
4218
4219int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4220{
4221    flag aSign;
4222    int32 aExp, shiftCount;
4223    uint64_t aSig;
4224
4225    aSig = extractFloatx80Frac( a );
4226    aExp = extractFloatx80Exp( a );
4227    aSign = extractFloatx80Sign( a );
4228    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4229    shiftCount = 0x4037 - aExp;
4230    if ( shiftCount <= 0 ) shiftCount = 1;
4231    shift64RightJamming( aSig, shiftCount, &aSig );
4232    return roundAndPackInt32( aSign, aSig STATUS_VAR );
4233
4234}
4235
4236/*----------------------------------------------------------------------------
4237| Returns the result of converting the extended double-precision floating-
4238| point value `a' to the 32-bit two's complement integer format.  The
4239| conversion is performed according to the IEC/IEEE Standard for Binary
4240| Floating-Point Arithmetic, except that the conversion is always rounded
4241| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4242| Otherwise, if the conversion overflows, the largest integer with the same
4243| sign as `a' is returned.
4244*----------------------------------------------------------------------------*/
4245
4246int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4247{
4248    flag aSign;
4249    int32 aExp, shiftCount;
4250    uint64_t aSig, savedASig;
4251    int32_t z;
4252
4253    aSig = extractFloatx80Frac( a );
4254    aExp = extractFloatx80Exp( a );
4255    aSign = extractFloatx80Sign( a );
4256    if ( 0x401E < aExp ) {
4257        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4258        goto invalid;
4259    }
4260    else if ( aExp < 0x3FFF ) {
4261        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4262        return 0;
4263    }
4264    shiftCount = 0x403E - aExp;
4265    savedASig = aSig;
4266    aSig >>= shiftCount;
4267    z = aSig;
4268    if ( aSign ) z = - z;
4269    if ( ( z < 0 ) ^ aSign ) {
4270 invalid:
4271        float_raise( float_flag_invalid STATUS_VAR);
4272        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4273    }
4274    if ( ( aSig<<shiftCount ) != savedASig ) {
4275        STATUS(float_exception_flags) |= float_flag_inexact;
4276    }
4277    return z;
4278
4279}
4280
4281/*----------------------------------------------------------------------------
4282| Returns the result of converting the extended double-precision floating-
4283| point value `a' to the 64-bit two's complement integer format.  The
4284| conversion is performed according to the IEC/IEEE Standard for Binary
4285| Floating-Point Arithmetic---which means in particular that the conversion
4286| is rounded according to the current rounding mode.  If `a' is a NaN,
4287| the largest positive integer is returned.  Otherwise, if the conversion
4288| overflows, the largest integer with the same sign as `a' is returned.
4289*----------------------------------------------------------------------------*/
4290
4291int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4292{
4293    flag aSign;
4294    int32 aExp, shiftCount;
4295    uint64_t aSig, aSigExtra;
4296
4297    aSig = extractFloatx80Frac( a );
4298    aExp = extractFloatx80Exp( a );
4299    aSign = extractFloatx80Sign( a );
4300    shiftCount = 0x403E - aExp;
4301    if ( shiftCount <= 0 ) {
4302        if ( shiftCount ) {
4303            float_raise( float_flag_invalid STATUS_VAR);
4304            if (    ! aSign
4305                 || (    ( aExp == 0x7FFF )
4306                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4307               ) {
4308                return LIT64( 0x7FFFFFFFFFFFFFFF );
4309            }
4310            return (int64_t) LIT64( 0x8000000000000000 );
4311        }
4312        aSigExtra = 0;
4313    }
4314    else {
4315        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4316    }
4317    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4318
4319}
4320
4321/*----------------------------------------------------------------------------
4322| Returns the result of converting the extended double-precision floating-
4323| point value `a' to the 64-bit two's complement integer format.  The
4324| conversion is performed according to the IEC/IEEE Standard for Binary
4325| Floating-Point Arithmetic, except that the conversion is always rounded
4326| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4327| Otherwise, if the conversion overflows, the largest integer with the same
4328| sign as `a' is returned.
4329*----------------------------------------------------------------------------*/
4330
4331int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4332{
4333    flag aSign;
4334    int32 aExp, shiftCount;
4335    uint64_t aSig;
4336    int64 z;
4337
4338    aSig = extractFloatx80Frac( a );
4339    aExp = extractFloatx80Exp( a );
4340    aSign = extractFloatx80Sign( a );
4341    shiftCount = aExp - 0x403E;
4342    if ( 0 <= shiftCount ) {
4343        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4344        if ( ( a.high != 0xC03E ) || aSig ) {
4345            float_raise( float_flag_invalid STATUS_VAR);
4346            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4347                return LIT64( 0x7FFFFFFFFFFFFFFF );
4348            }
4349        }
4350        return (int64_t) LIT64( 0x8000000000000000 );
4351    }
4352    else if ( aExp < 0x3FFF ) {
4353        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4354        return 0;
4355    }
4356    z = aSig>>( - shiftCount );
4357    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4358        STATUS(float_exception_flags) |= float_flag_inexact;
4359    }
4360    if ( aSign ) z = - z;
4361    return z;
4362
4363}
4364
4365/*----------------------------------------------------------------------------
4366| Returns the result of converting the extended double-precision floating-
4367| point value `a' to the single-precision floating-point format.  The
4368| conversion is performed according to the IEC/IEEE Standard for Binary
4369| Floating-Point Arithmetic.
4370*----------------------------------------------------------------------------*/
4371
4372float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4373{
4374    flag aSign;
4375    int32 aExp;
4376    uint64_t aSig;
4377
4378    aSig = extractFloatx80Frac( a );
4379    aExp = extractFloatx80Exp( a );
4380    aSign = extractFloatx80Sign( a );
4381    if ( aExp == 0x7FFF ) {
4382        if ( (uint64_t) ( aSig<<1 ) ) {
4383            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4384        }
4385        return packFloat32( aSign, 0xFF, 0 );
4386    }
4387    shift64RightJamming( aSig, 33, &aSig );
4388    if ( aExp || aSig ) aExp -= 0x3F81;
4389    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4390
4391}
4392
4393/*----------------------------------------------------------------------------
4394| Returns the result of converting the extended double-precision floating-
4395| point value `a' to the double-precision floating-point format.  The
4396| conversion is performed according to the IEC/IEEE Standard for Binary
4397| Floating-Point Arithmetic.
4398*----------------------------------------------------------------------------*/
4399
4400float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4401{
4402    flag aSign;
4403    int32 aExp;
4404    uint64_t aSig, zSig;
4405
4406    aSig = extractFloatx80Frac( a );
4407    aExp = extractFloatx80Exp( a );
4408    aSign = extractFloatx80Sign( a );
4409    if ( aExp == 0x7FFF ) {
4410        if ( (uint64_t) ( aSig<<1 ) ) {
4411            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4412        }
4413        return packFloat64( aSign, 0x7FF, 0 );
4414    }
4415    shift64RightJamming( aSig, 1, &zSig );
4416    if ( aExp || aSig ) aExp -= 0x3C01;
4417    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4418
4419}
4420
4421/*----------------------------------------------------------------------------
4422| Returns the result of converting the extended double-precision floating-
4423| point value `a' to the quadruple-precision floating-point format.  The
4424| conversion is performed according to the IEC/IEEE Standard for Binary
4425| Floating-Point Arithmetic.
4426*----------------------------------------------------------------------------*/
4427
4428float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4429{
4430    flag aSign;
4431    int_fast16_t aExp;
4432    uint64_t aSig, zSig0, zSig1;
4433
4434    aSig = extractFloatx80Frac( a );
4435    aExp = extractFloatx80Exp( a );
4436    aSign = extractFloatx80Sign( a );
4437    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4438        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4439    }
4440    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4441    return packFloat128( aSign, aExp, zSig0, zSig1 );
4442
4443}
4444
4445/*----------------------------------------------------------------------------
4446| Rounds the extended double-precision floating-point value `a' to an integer,
4447| and returns the result as an extended quadruple-precision floating-point
4448| value.  The operation is performed according to the IEC/IEEE Standard for
4449| Binary Floating-Point Arithmetic.
4450*----------------------------------------------------------------------------*/
4451
4452floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4453{
4454    flag aSign;
4455    int32 aExp;
4456    uint64_t lastBitMask, roundBitsMask;
4457    int8 roundingMode;
4458    floatx80 z;
4459
4460    aExp = extractFloatx80Exp( a );
4461    if ( 0x403E <= aExp ) {
4462        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4463            return propagateFloatx80NaN( a, a STATUS_VAR );
4464        }
4465        return a;
4466    }
4467    if ( aExp < 0x3FFF ) {
4468        if (    ( aExp == 0 )
4469             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4470            return a;
4471        }
4472        STATUS(float_exception_flags) |= float_flag_inexact;
4473        aSign = extractFloatx80Sign( a );
4474        switch ( STATUS(float_rounding_mode) ) {
4475         case float_round_nearest_even:
4476            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4477               ) {
4478                return
4479                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4480            }
4481            break;
4482         case float_round_down:
4483            return
4484                  aSign ?
4485                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4486                : packFloatx80( 0, 0, 0 );
4487         case float_round_up:
4488            return
4489                  aSign ? packFloatx80( 1, 0, 0 )
4490                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4491        }
4492        return packFloatx80( aSign, 0, 0 );
4493    }
4494    lastBitMask = 1;
4495    lastBitMask <<= 0x403E - aExp;
4496    roundBitsMask = lastBitMask - 1;
4497    z = a;
4498    roundingMode = STATUS(float_rounding_mode);
4499    if ( roundingMode == float_round_nearest_even ) {
4500        z.low += lastBitMask>>1;
4501        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4502    }
4503    else if ( roundingMode != float_round_to_zero ) {
4504        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4505            z.low += roundBitsMask;
4506        }
4507    }
4508    z.low &= ~ roundBitsMask;
4509    if ( z.low == 0 ) {
4510        ++z.high;
4511        z.low = LIT64( 0x8000000000000000 );
4512    }
4513    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4514    return z;
4515
4516}
4517
4518/*----------------------------------------------------------------------------
4519| Returns the result of adding the absolute values of the extended double-
4520| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4521| negated before being returned.  `zSign' is ignored if the result is a NaN.
4522| The addition is performed according to the IEC/IEEE Standard for Binary
4523| Floating-Point Arithmetic.
4524*----------------------------------------------------------------------------*/
4525
4526static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4527{
4528    int32 aExp, bExp, zExp;
4529    uint64_t aSig, bSig, zSig0, zSig1;
4530    int32 expDiff;
4531
4532    aSig = extractFloatx80Frac( a );
4533    aExp = extractFloatx80Exp( a );
4534    bSig = extractFloatx80Frac( b );
4535    bExp = extractFloatx80Exp( b );
4536    expDiff = aExp - bExp;
4537    if ( 0 < expDiff ) {
4538        if ( aExp == 0x7FFF ) {
4539            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4540            return a;
4541        }
4542        if ( bExp == 0 ) --expDiff;
4543        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4544        zExp = aExp;
4545    }
4546    else if ( expDiff < 0 ) {
4547        if ( bExp == 0x7FFF ) {
4548            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4549            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4550        }
4551        if ( aExp == 0 ) ++expDiff;
4552        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4553        zExp = bExp;
4554    }
4555    else {
4556        if ( aExp == 0x7FFF ) {
4557            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4558                return propagateFloatx80NaN( a, b STATUS_VAR );
4559            }
4560            return a;
4561        }
4562        zSig1 = 0;
4563        zSig0 = aSig + bSig;
4564        if ( aExp == 0 ) {
4565            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4566            goto roundAndPack;
4567        }
4568        zExp = aExp;
4569        goto shiftRight1;
4570    }
4571    zSig0 = aSig + bSig;
4572    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4573 shiftRight1:
4574    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4575    zSig0 |= LIT64( 0x8000000000000000 );
4576    ++zExp;
4577 roundAndPack:
4578    return
4579        roundAndPackFloatx80(
4580            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4581
4582}
4583
4584/*----------------------------------------------------------------------------
4585| Returns the result of subtracting the absolute values of the extended
4586| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4587| difference is negated before being returned.  `zSign' is ignored if the
4588| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4589| Standard for Binary Floating-Point Arithmetic.
4590*----------------------------------------------------------------------------*/
4591
4592static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4593{
4594    int32 aExp, bExp, zExp;
4595    uint64_t aSig, bSig, zSig0, zSig1;
4596    int32 expDiff;
4597    floatx80 z;
4598
4599    aSig = extractFloatx80Frac( a );
4600    aExp = extractFloatx80Exp( a );
4601    bSig = extractFloatx80Frac( b );
4602    bExp = extractFloatx80Exp( b );
4603    expDiff = aExp - bExp;
4604    if ( 0 < expDiff ) goto aExpBigger;
4605    if ( expDiff < 0 ) goto bExpBigger;
4606    if ( aExp == 0x7FFF ) {
4607        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4608            return propagateFloatx80NaN( a, b STATUS_VAR );
4609        }
4610        float_raise( float_flag_invalid STATUS_VAR);
4611        z.low = floatx80_default_nan_low;
4612        z.high = floatx80_default_nan_high;
4613        return z;
4614    }
4615    if ( aExp == 0 ) {
4616        aExp = 1;
4617        bExp = 1;
4618    }
4619    zSig1 = 0;
4620    if ( bSig < aSig ) goto aBigger;
4621    if ( aSig < bSig ) goto bBigger;
4622    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4623 bExpBigger:
4624    if ( bExp == 0x7FFF ) {
4625        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4626        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4627    }
4628    if ( aExp == 0 ) ++expDiff;
4629    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4630 bBigger:
4631    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4632    zExp = bExp;
4633    zSign ^= 1;
4634    goto normalizeRoundAndPack;
4635 aExpBigger:
4636    if ( aExp == 0x7FFF ) {
4637        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4638        return a;
4639    }
4640    if ( bExp == 0 ) --expDiff;
4641    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4642 aBigger:
4643    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4644    zExp = aExp;
4645 normalizeRoundAndPack:
4646    return
4647        normalizeRoundAndPackFloatx80(
4648            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4649
4650}
4651
4652/*----------------------------------------------------------------------------
4653| Returns the result of adding the extended double-precision floating-point
4654| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4655| Standard for Binary Floating-Point Arithmetic.
4656*----------------------------------------------------------------------------*/
4657
4658floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4659{
4660    flag aSign, bSign;
4661
4662    aSign = extractFloatx80Sign( a );
4663    bSign = extractFloatx80Sign( b );
4664    if ( aSign == bSign ) {
4665        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4666    }
4667    else {
4668        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4669    }
4670
4671}
4672
4673/*----------------------------------------------------------------------------
4674| Returns the result of subtracting the extended double-precision floating-
4675| point values `a' and `b'.  The operation is performed according to the
4676| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4677*----------------------------------------------------------------------------*/
4678
4679floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4680{
4681    flag aSign, bSign;
4682
4683    aSign = extractFloatx80Sign( a );
4684    bSign = extractFloatx80Sign( b );
4685    if ( aSign == bSign ) {
4686        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4687    }
4688    else {
4689        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4690    }
4691
4692}
4693
4694/*----------------------------------------------------------------------------
4695| Returns the result of multiplying the extended double-precision floating-
4696| point values `a' and `b'.  The operation is performed according to the
4697| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4698*----------------------------------------------------------------------------*/
4699
4700floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4701{
4702    flag aSign, bSign, zSign;
4703    int32 aExp, bExp, zExp;
4704    uint64_t aSig, bSig, zSig0, zSig1;
4705    floatx80 z;
4706
4707    aSig = extractFloatx80Frac( a );
4708    aExp = extractFloatx80Exp( a );
4709    aSign = extractFloatx80Sign( a );
4710    bSig = extractFloatx80Frac( b );
4711    bExp = extractFloatx80Exp( b );
4712    bSign = extractFloatx80Sign( b );
4713    zSign = aSign ^ bSign;
4714    if ( aExp == 0x7FFF ) {
4715        if (    (uint64_t) ( aSig<<1 )
4716             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4717            return propagateFloatx80NaN( a, b STATUS_VAR );
4718        }
4719        if ( ( bExp | bSig ) == 0 ) goto invalid;
4720        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4721    }
4722    if ( bExp == 0x7FFF ) {
4723        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4724        if ( ( aExp | aSig ) == 0 ) {
4725 invalid:
4726            float_raise( float_flag_invalid STATUS_VAR);
4727            z.low = floatx80_default_nan_low;
4728            z.high = floatx80_default_nan_high;
4729            return z;
4730        }
4731        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4732    }
4733    if ( aExp == 0 ) {
4734        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4735        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4736    }
4737    if ( bExp == 0 ) {
4738        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4739        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4740    }
4741    zExp = aExp + bExp - 0x3FFE;
4742    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4743    if ( 0 < (int64_t) zSig0 ) {
4744        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4745        --zExp;
4746    }
4747    return
4748        roundAndPackFloatx80(
4749            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4750
4751}
4752
4753/*----------------------------------------------------------------------------
4754| Returns the result of dividing the extended double-precision floating-point
4755| value `a' by the corresponding value `b'.  The operation is performed
4756| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4757*----------------------------------------------------------------------------*/
4758
4759floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4760{
4761    flag aSign, bSign, zSign;
4762    int32 aExp, bExp, zExp;
4763    uint64_t aSig, bSig, zSig0, zSig1;
4764    uint64_t rem0, rem1, rem2, term0, term1, term2;
4765    floatx80 z;
4766
4767    aSig = extractFloatx80Frac( a );
4768    aExp = extractFloatx80Exp( a );
4769    aSign = extractFloatx80Sign( a );
4770    bSig = extractFloatx80Frac( b );
4771    bExp = extractFloatx80Exp( b );
4772    bSign = extractFloatx80Sign( b );
4773    zSign = aSign ^ bSign;
4774    if ( aExp == 0x7FFF ) {
4775        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4776        if ( bExp == 0x7FFF ) {
4777            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4778            goto invalid;
4779        }
4780        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4781    }
4782    if ( bExp == 0x7FFF ) {
4783        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4784        return packFloatx80( zSign, 0, 0 );
4785    }
4786    if ( bExp == 0 ) {
4787        if ( bSig == 0 ) {
4788            if ( ( aExp | aSig ) == 0 ) {
4789 invalid:
4790                float_raise( float_flag_invalid STATUS_VAR);
4791                z.low = floatx80_default_nan_low;
4792                z.high = floatx80_default_nan_high;
4793                return z;
4794            }
4795            float_raise( float_flag_divbyzero STATUS_VAR);
4796            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4797        }
4798        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4799    }
4800    if ( aExp == 0 ) {
4801        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4802        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4803    }
4804    zExp = aExp - bExp + 0x3FFE;
4805    rem1 = 0;
4806    if ( bSig <= aSig ) {
4807        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4808        ++zExp;
4809    }
4810    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4811    mul64To128( bSig, zSig0, &term0, &term1 );
4812    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4813    while ( (int64_t) rem0 < 0 ) {
4814        --zSig0;
4815        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4816    }
4817    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4818    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4819        mul64To128( bSig, zSig1, &term1, &term2 );
4820        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4821        while ( (int64_t) rem1 < 0 ) {
4822            --zSig1;
4823            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4824        }
4825        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4826    }
4827    return
4828        roundAndPackFloatx80(
4829            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4830
4831}
4832
4833/*----------------------------------------------------------------------------
4834| Returns the remainder of the extended double-precision floating-point value
4835| `a' with respect to the corresponding value `b'.  The operation is performed
4836| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4837*----------------------------------------------------------------------------*/
4838
4839floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4840{
4841    flag aSign, zSign;
4842    int32 aExp, bExp, expDiff;
4843    uint64_t aSig0, aSig1, bSig;
4844    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4845    floatx80 z;
4846
4847    aSig0 = extractFloatx80Frac( a );
4848    aExp = extractFloatx80Exp( a );
4849    aSign = extractFloatx80Sign( a );
4850    bSig = extractFloatx80Frac( b );
4851    bExp = extractFloatx80Exp( b );
4852    if ( aExp == 0x7FFF ) {
4853        if (    (uint64_t) ( aSig0<<1 )
4854             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4855            return propagateFloatx80NaN( a, b STATUS_VAR );
4856        }
4857        goto invalid;
4858    }
4859    if ( bExp == 0x7FFF ) {
4860        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4861        return a;
4862    }
4863    if ( bExp == 0 ) {
4864        if ( bSig == 0 ) {
4865 invalid:
4866            float_raise( float_flag_invalid STATUS_VAR);
4867            z.low = floatx80_default_nan_low;
4868            z.high = floatx80_default_nan_high;
4869            return z;
4870        }
4871        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4872    }
4873    if ( aExp == 0 ) {
4874        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4875        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4876    }
4877    bSig |= LIT64( 0x8000000000000000 );
4878    zSign = aSign;
4879    expDiff = aExp - bExp;
4880    aSig1 = 0;
4881    if ( expDiff < 0 ) {
4882        if ( expDiff < -1 ) return a;
4883        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4884        expDiff = 0;
4885    }
4886    q = ( bSig <= aSig0 );
4887    if ( q ) aSig0 -= bSig;
4888    expDiff -= 64;
4889    while ( 0 < expDiff ) {
4890        q = estimateDiv128To64( aSig0, aSig1, bSig );
4891        q = ( 2 < q ) ? q - 2 : 0;
4892        mul64To128( bSig, q, &term0, &term1 );
4893        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4894        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4895        expDiff -= 62;
4896    }
4897    expDiff += 64;
4898    if ( 0 < expDiff ) {
4899        q = estimateDiv128To64( aSig0, aSig1, bSig );
4900        q = ( 2 < q ) ? q - 2 : 0;
4901        q >>= 64 - expDiff;
4902        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4903        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4904        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4905        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4906            ++q;
4907            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4908        }
4909    }
4910    else {
4911        term1 = 0;
4912        term0 = bSig;
4913    }
4914    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4915    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4916         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4917              && ( q & 1 ) )
4918       ) {
4919        aSig0 = alternateASig0;
4920        aSig1 = alternateASig1;
4921        zSign = ! zSign;
4922    }
4923    return
4924        normalizeRoundAndPackFloatx80(
4925            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4926
4927}
4928
4929/*----------------------------------------------------------------------------
4930| Returns the square root of the extended double-precision floating-point
4931| value `a'.  The operation is performed according to the IEC/IEEE Standard
4932| for Binary Floating-Point Arithmetic.
4933*----------------------------------------------------------------------------*/
4934
4935floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4936{
4937    flag aSign;
4938    int32 aExp, zExp;
4939    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4940    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4941    floatx80 z;
4942
4943    aSig0 = extractFloatx80Frac( a );
4944    aExp = extractFloatx80Exp( a );
4945    aSign = extractFloatx80Sign( a );
4946    if ( aExp == 0x7FFF ) {
4947        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4948        if ( ! aSign ) return a;
4949        goto invalid;
4950    }
4951    if ( aSign ) {
4952        if ( ( aExp | aSig0 ) == 0 ) return a;
4953 invalid:
4954        float_raise( float_flag_invalid STATUS_VAR);
4955        z.low = floatx80_default_nan_low;
4956        z.high = floatx80_default_nan_high;
4957        return z;
4958    }
4959    if ( aExp == 0 ) {
4960        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4961        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4962    }
4963    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4964    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4965    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4966    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967    doubleZSig0 = zSig0<<1;
4968    mul64To128( zSig0, zSig0, &term0, &term1 );
4969    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970    while ( (int64_t) rem0 < 0 ) {
4971        --zSig0;
4972        doubleZSig0 -= 2;
4973        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4974    }
4975    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4977        if ( zSig1 == 0 ) zSig1 = 1;
4978        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980        mul64To128( zSig1, zSig1, &term2, &term3 );
4981        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982        while ( (int64_t) rem1 < 0 ) {
4983            --zSig1;
4984            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985            term3 |= 1;
4986            term2 |= doubleZSig0;
4987            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988        }
4989        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4990    }
4991    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4992    zSig0 |= doubleZSig0;
4993    return
4994        roundAndPackFloatx80(
4995            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4996
4997}
4998
4999/*----------------------------------------------------------------------------
5000| Returns 1 if the extended double-precision floating-point value `a' is equal
5001| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5002| raised if either operand is a NaN.  Otherwise, the comparison is performed
5003| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5004*----------------------------------------------------------------------------*/
5005
5006int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5007{
5008
5009    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5010              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5011         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5012              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5013       ) {
5014        float_raise( float_flag_invalid STATUS_VAR);
5015        return 0;
5016    }
5017    return
5018           ( a.low == b.low )
5019        && (    ( a.high == b.high )
5020             || (    ( a.low == 0 )
5021                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5022           );
5023
5024}
5025
5026/*----------------------------------------------------------------------------
5027| Returns 1 if the extended double-precision floating-point value `a' is
5028| less than or equal to the corresponding value `b', and 0 otherwise.  The
5029| invalid exception is raised if either operand is a NaN.  The comparison is
5030| performed according to the IEC/IEEE Standard for Binary Floating-Point
5031| Arithmetic.
5032*----------------------------------------------------------------------------*/
5033
5034int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5035{
5036    flag aSign, bSign;
5037
5038    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5039              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5040         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5041              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5042       ) {
5043        float_raise( float_flag_invalid STATUS_VAR);
5044        return 0;
5045    }
5046    aSign = extractFloatx80Sign( a );
5047    bSign = extractFloatx80Sign( b );
5048    if ( aSign != bSign ) {
5049        return
5050               aSign
5051            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5052                 == 0 );
5053    }
5054    return
5055          aSign ? le128( b.high, b.low, a.high, a.low )
5056        : le128( a.high, a.low, b.high, b.low );
5057
5058}
5059
5060/*----------------------------------------------------------------------------
5061| Returns 1 if the extended double-precision floating-point value `a' is
5062| less than the corresponding value `b', and 0 otherwise.  The invalid
5063| exception is raised if either operand is a NaN.  The comparison is performed
5064| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5065*----------------------------------------------------------------------------*/
5066
5067int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5068{
5069    flag aSign, bSign;
5070
5071    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5072              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5073         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5074              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5075       ) {
5076        float_raise( float_flag_invalid STATUS_VAR);
5077        return 0;
5078    }
5079    aSign = extractFloatx80Sign( a );
5080    bSign = extractFloatx80Sign( b );
5081    if ( aSign != bSign ) {
5082        return
5083               aSign
5084            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5085                 != 0 );
5086    }
5087    return
5088          aSign ? lt128( b.high, b.low, a.high, a.low )
5089        : lt128( a.high, a.low, b.high, b.low );
5090
5091}
5092
5093/*----------------------------------------------------------------------------
5094| Returns 1 if the extended double-precision floating-point values `a' and `b'
5095| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5096| either operand is a NaN.   The comparison is performed according to the
5097| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098*----------------------------------------------------------------------------*/
5099int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5100{
5101    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5102              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5103         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5104              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5105       ) {
5106        float_raise( float_flag_invalid STATUS_VAR);
5107        return 1;
5108    }
5109    return 0;
5110}
5111
5112/*----------------------------------------------------------------------------
5113| Returns 1 if the extended double-precision floating-point value `a' is
5114| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5115| cause an exception.  The comparison is performed according to the IEC/IEEE
5116| Standard for Binary Floating-Point Arithmetic.
5117*----------------------------------------------------------------------------*/
5118
5119int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5120{
5121
5122    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5123              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5124         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5125              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5126       ) {
5127        if (    floatx80_is_signaling_nan( a )
5128             || floatx80_is_signaling_nan( b ) ) {
5129            float_raise( float_flag_invalid STATUS_VAR);
5130        }
5131        return 0;
5132    }
5133    return
5134           ( a.low == b.low )
5135        && (    ( a.high == b.high )
5136             || (    ( a.low == 0 )
5137                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5138           );
5139
5140}
5141
5142/*----------------------------------------------------------------------------
5143| Returns 1 if the extended double-precision floating-point value `a' is less
5144| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5145| do not cause an exception.  Otherwise, the comparison is performed according
5146| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5147*----------------------------------------------------------------------------*/
5148
5149int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5150{
5151    flag aSign, bSign;
5152
5153    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5154              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5155         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5156              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5157       ) {
5158        if (    floatx80_is_signaling_nan( a )
5159             || floatx80_is_signaling_nan( b ) ) {
5160            float_raise( float_flag_invalid STATUS_VAR);
5161        }
5162        return 0;
5163    }
5164    aSign = extractFloatx80Sign( a );
5165    bSign = extractFloatx80Sign( b );
5166    if ( aSign != bSign ) {
5167        return
5168               aSign
5169            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170                 == 0 );
5171    }
5172    return
5173          aSign ? le128( b.high, b.low, a.high, a.low )
5174        : le128( a.high, a.low, b.high, b.low );
5175
5176}
5177
5178/*----------------------------------------------------------------------------
5179| Returns 1 if the extended double-precision floating-point value `a' is less
5180| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5181| an exception.  Otherwise, the comparison is performed according to the
5182| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5183*----------------------------------------------------------------------------*/
5184
5185int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5186{
5187    flag aSign, bSign;
5188
5189    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5190              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5191         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5192              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5193       ) {
5194        if (    floatx80_is_signaling_nan( a )
5195             || floatx80_is_signaling_nan( b ) ) {
5196            float_raise( float_flag_invalid STATUS_VAR);
5197        }
5198        return 0;
5199    }
5200    aSign = extractFloatx80Sign( a );
5201    bSign = extractFloatx80Sign( b );
5202    if ( aSign != bSign ) {
5203        return
5204               aSign
5205            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5206                 != 0 );
5207    }
5208    return
5209          aSign ? lt128( b.high, b.low, a.high, a.low )
5210        : lt128( a.high, a.low, b.high, b.low );
5211
5212}
5213
5214/*----------------------------------------------------------------------------
5215| Returns 1 if the extended double-precision floating-point values `a' and `b'
5216| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5217| The comparison is performed according to the IEC/IEEE Standard for Binary
5218| Floating-Point Arithmetic.
5219*----------------------------------------------------------------------------*/
5220int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5221{
5222    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5223              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5224         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5225              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5226       ) {
5227        if (    floatx80_is_signaling_nan( a )
5228             || floatx80_is_signaling_nan( b ) ) {
5229            float_raise( float_flag_invalid STATUS_VAR);
5230        }
5231        return 1;
5232    }
5233    return 0;
5234}
5235
5236/*----------------------------------------------------------------------------
5237| Returns the result of converting the quadruple-precision floating-point
5238| value `a' to the 32-bit two's complement integer format.  The conversion
5239| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5240| Arithmetic---which means in particular that the conversion is rounded
5241| according to the current rounding mode.  If `a' is a NaN, the largest
5242| positive integer is returned.  Otherwise, if the conversion overflows, the
5243| largest integer with the same sign as `a' is returned.
5244*----------------------------------------------------------------------------*/
5245
5246int32 float128_to_int32( float128 a STATUS_PARAM )
5247{
5248    flag aSign;
5249    int32 aExp, shiftCount;
5250    uint64_t aSig0, aSig1;
5251
5252    aSig1 = extractFloat128Frac1( a );
5253    aSig0 = extractFloat128Frac0( a );
5254    aExp = extractFloat128Exp( a );
5255    aSign = extractFloat128Sign( a );
5256    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5257    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5258    aSig0 |= ( aSig1 != 0 );
5259    shiftCount = 0x4028 - aExp;
5260    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5261    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5262
5263}
5264
5265/*----------------------------------------------------------------------------
5266| Returns the result of converting the quadruple-precision floating-point
5267| value `a' to the 32-bit two's complement integer format.  The conversion
5268| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5269| Arithmetic, except that the conversion is always rounded toward zero.  If
5270| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5271| conversion overflows, the largest integer with the same sign as `a' is
5272| returned.
5273*----------------------------------------------------------------------------*/
5274
5275int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5276{
5277    flag aSign;
5278    int32 aExp, shiftCount;
5279    uint64_t aSig0, aSig1, savedASig;
5280    int32_t z;
5281
5282    aSig1 = extractFloat128Frac1( a );
5283    aSig0 = extractFloat128Frac0( a );
5284    aExp = extractFloat128Exp( a );
5285    aSign = extractFloat128Sign( a );
5286    aSig0 |= ( aSig1 != 0 );
5287    if ( 0x401E < aExp ) {
5288        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5289        goto invalid;
5290    }
5291    else if ( aExp < 0x3FFF ) {
5292        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5293        return 0;
5294    }
5295    aSig0 |= LIT64( 0x0001000000000000 );
5296    shiftCount = 0x402F - aExp;
5297    savedASig = aSig0;
5298    aSig0 >>= shiftCount;
5299    z = aSig0;
5300    if ( aSign ) z = - z;
5301    if ( ( z < 0 ) ^ aSign ) {
5302 invalid:
5303        float_raise( float_flag_invalid STATUS_VAR);
5304        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5305    }
5306    if ( ( aSig0<<shiftCount ) != savedASig ) {
5307        STATUS(float_exception_flags) |= float_flag_inexact;
5308    }
5309    return z;
5310
5311}
5312
5313/*----------------------------------------------------------------------------
5314| Returns the result of converting the quadruple-precision floating-point
5315| value `a' to the 64-bit two's complement integer format.  The conversion
5316| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5317| Arithmetic---which means in particular that the conversion is rounded
5318| according to the current rounding mode.  If `a' is a NaN, the largest
5319| positive integer is returned.  Otherwise, if the conversion overflows, the
5320| largest integer with the same sign as `a' is returned.
5321*----------------------------------------------------------------------------*/
5322
5323int64 float128_to_int64( float128 a STATUS_PARAM )
5324{
5325    flag aSign;
5326    int32 aExp, shiftCount;
5327    uint64_t aSig0, aSig1;
5328
5329    aSig1 = extractFloat128Frac1( a );
5330    aSig0 = extractFloat128Frac0( a );
5331    aExp = extractFloat128Exp( a );
5332    aSign = extractFloat128Sign( a );
5333    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5334    shiftCount = 0x402F - aExp;
5335    if ( shiftCount <= 0 ) {
5336        if ( 0x403E < aExp ) {
5337            float_raise( float_flag_invalid STATUS_VAR);
5338            if (    ! aSign
5339                 || (    ( aExp == 0x7FFF )
5340                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5341                    )
5342               ) {
5343                return LIT64( 0x7FFFFFFFFFFFFFFF );
5344            }
5345            return (int64_t) LIT64( 0x8000000000000000 );
5346        }
5347        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5348    }
5349    else {
5350        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5351    }
5352    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5353
5354}
5355
5356/*----------------------------------------------------------------------------
5357| Returns the result of converting the quadruple-precision floating-point
5358| value `a' to the 64-bit two's complement integer format.  The conversion
5359| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5360| Arithmetic, except that the conversion is always rounded toward zero.
5361| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5362| the conversion overflows, the largest integer with the same sign as `a' is
5363| returned.
5364*----------------------------------------------------------------------------*/
5365
5366int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5367{
5368    flag aSign;
5369    int32 aExp, shiftCount;
5370    uint64_t aSig0, aSig1;
5371    int64 z;
5372
5373    aSig1 = extractFloat128Frac1( a );
5374    aSig0 = extractFloat128Frac0( a );
5375    aExp = extractFloat128Exp( a );
5376    aSign = extractFloat128Sign( a );
5377    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5378    shiftCount = aExp - 0x402F;
5379    if ( 0 < shiftCount ) {
5380        if ( 0x403E <= aExp ) {
5381            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5382            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5383                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5384                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5385            }
5386            else {
5387                float_raise( float_flag_invalid STATUS_VAR);
5388                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5389                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5390                }
5391            }
5392            return (int64_t) LIT64( 0x8000000000000000 );
5393        }
5394        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5395        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5396            STATUS(float_exception_flags) |= float_flag_inexact;
5397        }
5398    }
5399    else {
5400        if ( aExp < 0x3FFF ) {
5401            if ( aExp | aSig0 | aSig1 ) {
5402                STATUS(float_exception_flags) |= float_flag_inexact;
5403            }
5404            return 0;
5405        }
5406        z = aSig0>>( - shiftCount );
5407        if (    aSig1
5408             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5409            STATUS(float_exception_flags) |= float_flag_inexact;
5410        }
5411    }
5412    if ( aSign ) z = - z;
5413    return z;
5414
5415}
5416
5417/*----------------------------------------------------------------------------
5418| Returns the result of converting the quadruple-precision floating-point
5419| value `a' to the single-precision floating-point format.  The conversion
5420| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5421| Arithmetic.
5422*----------------------------------------------------------------------------*/
5423
5424float32 float128_to_float32( float128 a STATUS_PARAM )
5425{
5426    flag aSign;
5427    int32 aExp;
5428    uint64_t aSig0, aSig1;
5429    uint32_t zSig;
5430
5431    aSig1 = extractFloat128Frac1( a );
5432    aSig0 = extractFloat128Frac0( a );
5433    aExp = extractFloat128Exp( a );
5434    aSign = extractFloat128Sign( a );
5435    if ( aExp == 0x7FFF ) {
5436        if ( aSig0 | aSig1 ) {
5437            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5438        }
5439        return packFloat32( aSign, 0xFF, 0 );
5440    }
5441    aSig0 |= ( aSig1 != 0 );
5442    shift64RightJamming( aSig0, 18, &aSig0 );
5443    zSig = aSig0;
5444    if ( aExp || zSig ) {
5445        zSig |= 0x40000000;
5446        aExp -= 0x3F81;
5447    }
5448    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5449
5450}
5451
5452/*----------------------------------------------------------------------------
5453| Returns the result of converting the quadruple-precision floating-point
5454| value `a' to the double-precision floating-point format.  The conversion
5455| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5456| Arithmetic.
5457*----------------------------------------------------------------------------*/
5458
5459float64 float128_to_float64( float128 a STATUS_PARAM )
5460{
5461    flag aSign;
5462    int32 aExp;
5463    uint64_t aSig0, aSig1;
5464
5465    aSig1 = extractFloat128Frac1( a );
5466    aSig0 = extractFloat128Frac0( a );
5467    aExp = extractFloat128Exp( a );
5468    aSign = extractFloat128Sign( a );
5469    if ( aExp == 0x7FFF ) {
5470        if ( aSig0 | aSig1 ) {
5471            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5472        }
5473        return packFloat64( aSign, 0x7FF, 0 );
5474    }
5475    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5476    aSig0 |= ( aSig1 != 0 );
5477    if ( aExp || aSig0 ) {
5478        aSig0 |= LIT64( 0x4000000000000000 );
5479        aExp -= 0x3C01;
5480    }
5481    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5482
5483}
5484
5485/*----------------------------------------------------------------------------
5486| Returns the result of converting the quadruple-precision floating-point
5487| value `a' to the extended double-precision floating-point format.  The
5488| conversion is performed according to the IEC/IEEE Standard for Binary
5489| Floating-Point Arithmetic.
5490*----------------------------------------------------------------------------*/
5491
5492floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5493{
5494    flag aSign;
5495    int32 aExp;
5496    uint64_t aSig0, aSig1;
5497
5498    aSig1 = extractFloat128Frac1( a );
5499    aSig0 = extractFloat128Frac0( a );
5500    aExp = extractFloat128Exp( a );
5501    aSign = extractFloat128Sign( a );
5502    if ( aExp == 0x7FFF ) {
5503        if ( aSig0 | aSig1 ) {
5504            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5505        }
5506        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5507    }
5508    if ( aExp == 0 ) {
5509        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5510        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5511    }
5512    else {
5513        aSig0 |= LIT64( 0x0001000000000000 );
5514    }
5515    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5516    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5517
5518}
5519
5520/*----------------------------------------------------------------------------
5521| Rounds the quadruple-precision floating-point value `a' to an integer, and
5522| returns the result as a quadruple-precision floating-point value.  The
5523| operation is performed according to the IEC/IEEE Standard for Binary
5524| Floating-Point Arithmetic.
5525*----------------------------------------------------------------------------*/
5526
5527float128 float128_round_to_int( float128 a STATUS_PARAM )
5528{
5529    flag aSign;
5530    int32 aExp;
5531    uint64_t lastBitMask, roundBitsMask;
5532    int8 roundingMode;
5533    float128 z;
5534
5535    aExp = extractFloat128Exp( a );
5536    if ( 0x402F <= aExp ) {
5537        if ( 0x406F <= aExp ) {
5538            if (    ( aExp == 0x7FFF )
5539                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5540               ) {
5541                return propagateFloat128NaN( a, a STATUS_VAR );
5542            }
5543            return a;
5544        }
5545        lastBitMask = 1;
5546        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5547        roundBitsMask = lastBitMask - 1;
5548        z = a;
5549        roundingMode = STATUS(float_rounding_mode);
5550        if ( roundingMode == float_round_nearest_even ) {
5551            if ( lastBitMask ) {
5552                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5553                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5554            }
5555            else {
5556                if ( (int64_t) z.low < 0 ) {
5557                    ++z.high;
5558                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5559                }
5560            }
5561        }
5562        else if ( roundingMode != float_round_to_zero ) {
5563            if (   extractFloat128Sign( z )
5564                 ^ ( roundingMode == float_round_up ) ) {
5565                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5566            }
5567        }
5568        z.low &= ~ roundBitsMask;
5569    }
5570    else {
5571        if ( aExp < 0x3FFF ) {
5572            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5573            STATUS(float_exception_flags) |= float_flag_inexact;
5574            aSign = extractFloat128Sign( a );
5575            switch ( STATUS(float_rounding_mode) ) {
5576             case float_round_nearest_even:
5577                if (    ( aExp == 0x3FFE )
5578                     && (   extractFloat128Frac0( a )
5579                          | extractFloat128Frac1( a ) )
5580                   ) {
5581                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5582                }
5583                break;
5584             case float_round_down:
5585                return
5586                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5587                    : packFloat128( 0, 0, 0, 0 );
5588             case float_round_up:
5589                return
5590                      aSign ? packFloat128( 1, 0, 0, 0 )
5591                    : packFloat128( 0, 0x3FFF, 0, 0 );
5592            }
5593            return packFloat128( aSign, 0, 0, 0 );
5594        }
5595        lastBitMask = 1;
5596        lastBitMask <<= 0x402F - aExp;
5597        roundBitsMask = lastBitMask - 1;
5598        z.low = 0;
5599        z.high = a.high;
5600        roundingMode = STATUS(float_rounding_mode);
5601        if ( roundingMode == float_round_nearest_even ) {
5602            z.high += lastBitMask>>1;
5603            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5604                z.high &= ~ lastBitMask;
5605            }
5606        }
5607        else if ( roundingMode != float_round_to_zero ) {
5608            if (   extractFloat128Sign( z )
5609                 ^ ( roundingMode == float_round_up ) ) {
5610                z.high |= ( a.low != 0 );
5611                z.high += roundBitsMask;
5612            }
5613        }
5614        z.high &= ~ roundBitsMask;
5615    }
5616    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5617        STATUS(float_exception_flags) |= float_flag_inexact;
5618    }
5619    return z;
5620
5621}
5622
5623/*----------------------------------------------------------------------------
5624| Returns the result of adding the absolute values of the quadruple-precision
5625| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5626| before being returned.  `zSign' is ignored if the result is a NaN.
5627| The addition is performed according to the IEC/IEEE Standard for Binary
5628| Floating-Point Arithmetic.
5629*----------------------------------------------------------------------------*/
5630
5631static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5632{
5633    int32 aExp, bExp, zExp;
5634    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5635    int32 expDiff;
5636
5637    aSig1 = extractFloat128Frac1( a );
5638    aSig0 = extractFloat128Frac0( a );
5639    aExp = extractFloat128Exp( a );
5640    bSig1 = extractFloat128Frac1( b );
5641    bSig0 = extractFloat128Frac0( b );
5642    bExp = extractFloat128Exp( b );
5643    expDiff = aExp - bExp;
5644    if ( 0 < expDiff ) {
5645        if ( aExp == 0x7FFF ) {
5646            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5647            return a;
5648        }
5649        if ( bExp == 0 ) {
5650            --expDiff;
5651        }
5652        else {
5653            bSig0 |= LIT64( 0x0001000000000000 );
5654        }
5655        shift128ExtraRightJamming(
5656            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5657        zExp = aExp;
5658    }
5659    else if ( expDiff < 0 ) {
5660        if ( bExp == 0x7FFF ) {
5661            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5662            return packFloat128( zSign, 0x7FFF, 0, 0 );
5663        }
5664        if ( aExp == 0 ) {
5665            ++expDiff;
5666        }
5667        else {
5668            aSig0 |= LIT64( 0x0001000000000000 );
5669        }
5670        shift128ExtraRightJamming(
5671            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5672        zExp = bExp;
5673    }
5674    else {
5675        if ( aExp == 0x7FFF ) {
5676            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5677                return propagateFloat128NaN( a, b STATUS_VAR );
5678            }
5679            return a;
5680        }
5681        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5682        if ( aExp == 0 ) {
5683            if (STATUS(flush_to_zero)) {
5684                if (zSig0 | zSig1) {
5685                    float_raise(float_flag_output_denormal STATUS_VAR);
5686                }
5687                return packFloat128(zSign, 0, 0, 0);
5688            }
5689            return packFloat128( zSign, 0, zSig0, zSig1 );
5690        }
5691        zSig2 = 0;
5692        zSig0 |= LIT64( 0x0002000000000000 );
5693        zExp = aExp;
5694        goto shiftRight1;
5695    }
5696    aSig0 |= LIT64( 0x0001000000000000 );
5697    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5698    --zExp;
5699    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5700    ++zExp;
5701 shiftRight1:
5702    shift128ExtraRightJamming(
5703        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5704 roundAndPack:
5705    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5706
5707}
5708
5709/*----------------------------------------------------------------------------
5710| Returns the result of subtracting the absolute values of the quadruple-
5711| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5712| difference is negated before being returned.  `zSign' is ignored if the
5713| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5714| Standard for Binary Floating-Point Arithmetic.
5715*----------------------------------------------------------------------------*/
5716
5717static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5718{
5719    int32 aExp, bExp, zExp;
5720    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5721    int32 expDiff;
5722    float128 z;
5723
5724    aSig1 = extractFloat128Frac1( a );
5725    aSig0 = extractFloat128Frac0( a );
5726    aExp = extractFloat128Exp( a );
5727    bSig1 = extractFloat128Frac1( b );
5728    bSig0 = extractFloat128Frac0( b );
5729    bExp = extractFloat128Exp( b );
5730    expDiff = aExp - bExp;
5731    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5732    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5733    if ( 0 < expDiff ) goto aExpBigger;
5734    if ( expDiff < 0 ) goto bExpBigger;
5735    if ( aExp == 0x7FFF ) {
5736        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5737            return propagateFloat128NaN( a, b STATUS_VAR );
5738        }
5739        float_raise( float_flag_invalid STATUS_VAR);
5740        z.low = float128_default_nan_low;
5741        z.high = float128_default_nan_high;
5742        return z;
5743    }
5744    if ( aExp == 0 ) {
5745        aExp = 1;
5746        bExp = 1;
5747    }
5748    if ( bSig0 < aSig0 ) goto aBigger;
5749    if ( aSig0 < bSig0 ) goto bBigger;
5750    if ( bSig1 < aSig1 ) goto aBigger;
5751    if ( aSig1 < bSig1 ) goto bBigger;
5752    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5753 bExpBigger:
5754    if ( bExp == 0x7FFF ) {
5755        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5756        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5757    }
5758    if ( aExp == 0 ) {
5759        ++expDiff;
5760    }
5761    else {
5762        aSig0 |= LIT64( 0x4000000000000000 );
5763    }
5764    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5765    bSig0 |= LIT64( 0x4000000000000000 );
5766 bBigger:
5767    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5768    zExp = bExp;
5769    zSign ^= 1;
5770    goto normalizeRoundAndPack;
5771 aExpBigger:
5772    if ( aExp == 0x7FFF ) {
5773        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5774        return a;
5775    }
5776    if ( bExp == 0 ) {
5777        --expDiff;
5778    }
5779    else {
5780        bSig0 |= LIT64( 0x4000000000000000 );
5781    }
5782    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5783    aSig0 |= LIT64( 0x4000000000000000 );
5784 aBigger:
5785    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5786    zExp = aExp;
5787 normalizeRoundAndPack:
5788    --zExp;
5789    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5790
5791}
5792
5793/*----------------------------------------------------------------------------
5794| Returns the result of adding the quadruple-precision floating-point values
5795| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5796| for Binary Floating-Point Arithmetic.
5797*----------------------------------------------------------------------------*/
5798
5799float128 float128_add( float128 a, float128 b STATUS_PARAM )
5800{
5801    flag aSign, bSign;
5802
5803    aSign = extractFloat128Sign( a );
5804    bSign = extractFloat128Sign( b );
5805    if ( aSign == bSign ) {
5806        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5807    }
5808    else {
5809        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5810    }
5811
5812}
5813
5814/*----------------------------------------------------------------------------
5815| Returns the result of subtracting the quadruple-precision floating-point
5816| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5817| Standard for Binary Floating-Point Arithmetic.
5818*----------------------------------------------------------------------------*/
5819
5820float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5821{
5822    flag aSign, bSign;
5823
5824    aSign = extractFloat128Sign( a );
5825    bSign = extractFloat128Sign( b );
5826    if ( aSign == bSign ) {
5827        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5828    }
5829    else {
5830        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5831    }
5832
5833}
5834
5835/*----------------------------------------------------------------------------
5836| Returns the result of multiplying the quadruple-precision floating-point
5837| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5838| Standard for Binary Floating-Point Arithmetic.
5839*----------------------------------------------------------------------------*/
5840
5841float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5842{
5843    flag aSign, bSign, zSign;
5844    int32 aExp, bExp, zExp;
5845    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5846    float128 z;
5847
5848    aSig1 = extractFloat128Frac1( a );
5849    aSig0 = extractFloat128Frac0( a );
5850    aExp = extractFloat128Exp( a );
5851    aSign = extractFloat128Sign( a );
5852    bSig1 = extractFloat128Frac1( b );
5853    bSig0 = extractFloat128Frac0( b );
5854    bExp = extractFloat128Exp( b );
5855    bSign = extractFloat128Sign( b );
5856    zSign = aSign ^ bSign;
5857    if ( aExp == 0x7FFF ) {
5858        if (    ( aSig0 | aSig1 )
5859             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5860            return propagateFloat128NaN( a, b STATUS_VAR );
5861        }
5862        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5863        return packFloat128( zSign, 0x7FFF, 0, 0 );
5864    }
5865    if ( bExp == 0x7FFF ) {
5866        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5867        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5868 invalid:
5869            float_raise( float_flag_invalid STATUS_VAR);
5870            z.low = float128_default_nan_low;
5871            z.high = float128_default_nan_high;
5872            return z;
5873        }
5874        return packFloat128( zSign, 0x7FFF, 0, 0 );
5875    }
5876    if ( aExp == 0 ) {
5877        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5878        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5879    }
5880    if ( bExp == 0 ) {
5881        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5882        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5883    }
5884    zExp = aExp + bExp - 0x4000;
5885    aSig0 |= LIT64( 0x0001000000000000 );
5886    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5887    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5888    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5889    zSig2 |= ( zSig3 != 0 );
5890    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5891        shift128ExtraRightJamming(
5892            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5893        ++zExp;
5894    }
5895    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5896
5897}
5898
5899/*----------------------------------------------------------------------------
5900| Returns the result of dividing the quadruple-precision floating-point value
5901| `a' by the corresponding value `b'.  The operation is performed according to
5902| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5903*----------------------------------------------------------------------------*/
5904
5905float128 float128_div( float128 a, float128 b STATUS_PARAM )
5906{
5907    flag aSign, bSign, zSign;
5908    int32 aExp, bExp, zExp;
5909    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5910    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5911    float128 z;
5912
5913    aSig1 = extractFloat128Frac1( a );
5914    aSig0 = extractFloat128Frac0( a );
5915    aExp = extractFloat128Exp( a );
5916    aSign = extractFloat128Sign( a );
5917    bSig1 = extractFloat128Frac1( b );
5918    bSig0 = extractFloat128Frac0( b );
5919    bExp = extractFloat128Exp( b );
5920    bSign = extractFloat128Sign( b );
5921    zSign = aSign ^ bSign;
5922    if ( aExp == 0x7FFF ) {
5923        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5924        if ( bExp == 0x7FFF ) {
5925            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5926            goto invalid;
5927        }
5928        return packFloat128( zSign, 0x7FFF, 0, 0 );
5929    }
5930    if ( bExp == 0x7FFF ) {
5931        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5932        return packFloat128( zSign, 0, 0, 0 );
5933    }
5934    if ( bExp == 0 ) {
5935        if ( ( bSig0 | bSig1 ) == 0 ) {
5936            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5937 invalid:
5938                float_raise( float_flag_invalid STATUS_VAR);
5939                z.low = float128_default_nan_low;
5940                z.high = float128_default_nan_high;
5941                return z;
5942            }
5943            float_raise( float_flag_divbyzero STATUS_VAR);
5944            return packFloat128( zSign, 0x7FFF, 0, 0 );
5945        }
5946        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5947    }
5948    if ( aExp == 0 ) {
5949        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5950        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5951    }
5952    zExp = aExp - bExp + 0x3FFD;
5953    shortShift128Left(
5954        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5955    shortShift128Left(
5956        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5957    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5958        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5959        ++zExp;
5960    }
5961    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5962    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5963    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5964    while ( (int64_t) rem0 < 0 ) {
5965        --zSig0;
5966        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5967    }
5968    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5969    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5970        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5971        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5972        while ( (int64_t) rem1 < 0 ) {
5973            --zSig1;
5974            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5975        }
5976        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5977    }
5978    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5979    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5980
5981}
5982
5983/*----------------------------------------------------------------------------
5984| Returns the remainder of the quadruple-precision floating-point value `a'
5985| with respect to the corresponding value `b'.  The operation is performed
5986| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5987*----------------------------------------------------------------------------*/
5988
5989float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5990{
5991    flag aSign, zSign;
5992    int32 aExp, bExp, expDiff;
5993    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5994    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5995    int64_t sigMean0;
5996    float128 z;
5997
5998    aSig1 = extractFloat128Frac1( a );
5999    aSig0 = extractFloat128Frac0( a );
6000    aExp = extractFloat128Exp( a );
6001    aSign = extractFloat128Sign( a );
6002    bSig1 = extractFloat128Frac1( b );
6003    bSig0 = extractFloat128Frac0( b );
6004    bExp = extractFloat128Exp( b );
6005    if ( aExp == 0x7FFF ) {
6006        if (    ( aSig0 | aSig1 )
6007             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6008            return propagateFloat128NaN( a, b STATUS_VAR );
6009        }
6010        goto invalid;
6011    }
6012    if ( bExp == 0x7FFF ) {
6013        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6014        return a;
6015    }
6016    if ( bExp == 0 ) {
6017        if ( ( bSig0 | bSig1 ) == 0 ) {
6018 invalid:
6019            float_raise( float_flag_invalid STATUS_VAR);
6020            z.low = float128_default_nan_low;
6021            z.high = float128_default_nan_high;
6022            return z;
6023        }
6024        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6025    }
6026    if ( aExp == 0 ) {
6027        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6028        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6029    }
6030    expDiff = aExp - bExp;
6031    if ( expDiff < -1 ) return a;
6032    shortShift128Left(
6033        aSig0 | LIT64( 0x0001000000000000 ),
6034        aSig1,
6035        15 - ( expDiff < 0 ),
6036        &aSig0,
6037        &aSig1
6038    );
6039    shortShift128Left(
6040        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6041    q = le128( bSig0, bSig1, aSig0, aSig1 );
6042    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6043    expDiff -= 64;
6044    while ( 0 < expDiff ) {
6045        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6046        q = ( 4 < q ) ? q - 4 : 0;
6047        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6048        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6049        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6050        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6051        expDiff -= 61;
6052    }
6053    if ( -64 < expDiff ) {
6054        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6055        q = ( 4 < q ) ? q - 4 : 0;
6056        q >>= - expDiff;
6057        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6058        expDiff += 52;
6059        if ( expDiff < 0 ) {
6060            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6061        }
6062        else {
6063            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6064        }
6065        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6066        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6067    }
6068    else {
6069        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6070        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6071    }
6072    do {
6073        alternateASig0 = aSig0;
6074        alternateASig1 = aSig1;
6075        ++q;
6076        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6077    } while ( 0 <= (int64_t) aSig0 );
6078    add128(
6079        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6080    if (    ( sigMean0 < 0 )
6081         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6082        aSig0 = alternateASig0;
6083        aSig1 = alternateASig1;
6084    }
6085    zSign = ( (int64_t) aSig0 < 0 );
6086    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6087    return
6088        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6089
6090}
6091
6092/*----------------------------------------------------------------------------
6093| Returns the square root of the quadruple-precision floating-point value `a'.
6094| The operation is performed according to the IEC/IEEE Standard for Binary
6095| Floating-Point Arithmetic.
6096*----------------------------------------------------------------------------*/
6097
6098float128 float128_sqrt( float128 a STATUS_PARAM )
6099{
6100    flag aSign;
6101    int32 aExp, zExp;
6102    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6103    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6104    float128 z;
6105
6106    aSig1 = extractFloat128Frac1( a );
6107    aSig0 = extractFloat128Frac0( a );
6108    aExp = extractFloat128Exp( a );
6109    aSign = extractFloat128Sign( a );
6110    if ( aExp == 0x7FFF ) {
6111        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6112        if ( ! aSign ) return a;
6113        goto invalid;
6114    }
6115    if ( aSign ) {
6116        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6117 invalid:
6118        float_raise( float_flag_invalid STATUS_VAR);
6119        z.low = float128_default_nan_low;
6120        z.high = float128_default_nan_high;
6121        return z;
6122    }
6123    if ( aExp == 0 ) {
6124        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6125        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126    }
6127    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6128    aSig0 |= LIT64( 0x0001000000000000 );
6129    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6130    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6131    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6132    doubleZSig0 = zSig0<<1;
6133    mul64To128( zSig0, zSig0, &term0, &term1 );
6134    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6135    while ( (int64_t) rem0 < 0 ) {
6136        --zSig0;
6137        doubleZSig0 -= 2;
6138        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6139    }
6140    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6141    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6142        if ( zSig1 == 0 ) zSig1 = 1;
6143        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6144        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6145        mul64To128( zSig1, zSig1, &term2, &term3 );
6146        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6147        while ( (int64_t) rem1 < 0 ) {
6148            --zSig1;
6149            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6150            term3 |= 1;
6151            term2 |= doubleZSig0;
6152            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6153        }
6154        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6155    }
6156    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6157    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6158
6159}
6160
6161/*----------------------------------------------------------------------------
6162| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6163| the corresponding value `b', and 0 otherwise.  The invalid exception is
6164| raised if either operand is a NaN.  Otherwise, the comparison is performed
6165| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6166*----------------------------------------------------------------------------*/
6167
6168int float128_eq( float128 a, float128 b STATUS_PARAM )
6169{
6170
6171    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6172              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6173         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6174              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6175       ) {
6176        float_raise( float_flag_invalid STATUS_VAR);
6177        return 0;
6178    }
6179    return
6180           ( a.low == b.low )
6181        && (    ( a.high == b.high )
6182             || (    ( a.low == 0 )
6183                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6184           );
6185
6186}
6187
6188/*----------------------------------------------------------------------------
6189| Returns 1 if the quadruple-precision floating-point value `a' is less than
6190| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6191| exception is raised if either operand is a NaN.  The comparison is performed
6192| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6193*----------------------------------------------------------------------------*/
6194
6195int float128_le( float128 a, float128 b STATUS_PARAM )
6196{
6197    flag aSign, bSign;
6198
6199    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6200              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6201         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6202              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6203       ) {
6204        float_raise( float_flag_invalid STATUS_VAR);
6205        return 0;
6206    }
6207    aSign = extractFloat128Sign( a );
6208    bSign = extractFloat128Sign( b );
6209    if ( aSign != bSign ) {
6210        return
6211               aSign
6212            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6213                 == 0 );
6214    }
6215    return
6216          aSign ? le128( b.high, b.low, a.high, a.low )
6217        : le128( a.high, a.low, b.high, b.low );
6218
6219}
6220
6221/*----------------------------------------------------------------------------
6222| Returns 1 if the quadruple-precision floating-point value `a' is less than
6223| the corresponding value `b', and 0 otherwise.  The invalid exception is
6224| raised if either operand is a NaN.  The comparison is performed according
6225| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6226*----------------------------------------------------------------------------*/
6227
6228int float128_lt( float128 a, float128 b STATUS_PARAM )
6229{
6230    flag aSign, bSign;
6231
6232    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6233              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6234         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6235              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6236       ) {
6237        float_raise( float_flag_invalid STATUS_VAR);
6238        return 0;
6239    }
6240    aSign = extractFloat128Sign( a );
6241    bSign = extractFloat128Sign( b );
6242    if ( aSign != bSign ) {
6243        return
6244               aSign
6245            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6246                 != 0 );
6247    }
6248    return
6249          aSign ? lt128( b.high, b.low, a.high, a.low )
6250        : lt128( a.high, a.low, b.high, b.low );
6251
6252}
6253
6254/*----------------------------------------------------------------------------
6255| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6256| be compared, and 0 otherwise.  The invalid exception is raised if either
6257| operand is a NaN. The comparison is performed according to the IEC/IEEE
6258| Standard for Binary Floating-Point Arithmetic.
6259*----------------------------------------------------------------------------*/
6260
6261int float128_unordered( float128 a, float128 b STATUS_PARAM )
6262{
6263    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6264              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6265         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6266              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6267       ) {
6268        float_raise( float_flag_invalid STATUS_VAR);
6269        return 1;
6270    }
6271    return 0;
6272}
6273
6274/*----------------------------------------------------------------------------
6275| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6276| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6277| exception.  The comparison is performed according to the IEC/IEEE Standard
6278| for Binary Floating-Point Arithmetic.
6279*----------------------------------------------------------------------------*/
6280
6281int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6282{
6283
6284    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6285              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6286         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6287              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6288       ) {
6289        if (    float128_is_signaling_nan( a )
6290             || float128_is_signaling_nan( b ) ) {
6291            float_raise( float_flag_invalid STATUS_VAR);
6292        }
6293        return 0;
6294    }
6295    return
6296           ( a.low == b.low )
6297        && (    ( a.high == b.high )
6298             || (    ( a.low == 0 )
6299                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6300           );
6301
6302}
6303
6304/*----------------------------------------------------------------------------
6305| Returns 1 if the quadruple-precision floating-point value `a' is less than
6306| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6307| cause an exception.  Otherwise, the comparison is performed according to the
6308| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6309*----------------------------------------------------------------------------*/
6310
6311int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6312{
6313    flag aSign, bSign;
6314
6315    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6316              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6317         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6318              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6319       ) {
6320        if (    float128_is_signaling_nan( a )
6321             || float128_is_signaling_nan( b ) ) {
6322            float_raise( float_flag_invalid STATUS_VAR);
6323        }
6324        return 0;
6325    }
6326    aSign = extractFloat128Sign( a );
6327    bSign = extractFloat128Sign( b );
6328    if ( aSign != bSign ) {
6329        return
6330               aSign
6331            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6332                 == 0 );
6333    }
6334    return
6335          aSign ? le128( b.high, b.low, a.high, a.low )
6336        : le128( a.high, a.low, b.high, b.low );
6337
6338}
6339
6340/*----------------------------------------------------------------------------
6341| Returns 1 if the quadruple-precision floating-point value `a' is less than
6342| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6343| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6344| Standard for Binary Floating-Point Arithmetic.
6345*----------------------------------------------------------------------------*/
6346
6347int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6348{
6349    flag aSign, bSign;
6350
6351    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6352              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6353         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6354              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6355       ) {
6356        if (    float128_is_signaling_nan( a )
6357             || float128_is_signaling_nan( b ) ) {
6358            float_raise( float_flag_invalid STATUS_VAR);
6359        }
6360        return 0;
6361    }
6362    aSign = extractFloat128Sign( a );
6363    bSign = extractFloat128Sign( b );
6364    if ( aSign != bSign ) {
6365        return
6366               aSign
6367            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6368                 != 0 );
6369    }
6370    return
6371          aSign ? lt128( b.high, b.low, a.high, a.low )
6372        : lt128( a.high, a.low, b.high, b.low );
6373
6374}
6375
6376/*----------------------------------------------------------------------------
6377| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6378| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6379| comparison is performed according to the IEC/IEEE Standard for Binary
6380| Floating-Point Arithmetic.
6381*----------------------------------------------------------------------------*/
6382
6383int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6384{
6385    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6386              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6387         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6388              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6389       ) {
6390        if (    float128_is_signaling_nan( a )
6391             || float128_is_signaling_nan( b ) ) {
6392            float_raise( float_flag_invalid STATUS_VAR);
6393        }
6394        return 1;
6395    }
6396    return 0;
6397}
6398
6399/* misc functions */
6400float32 uint32_to_float32( uint32 a STATUS_PARAM )
6401{
6402    return int64_to_float32(a STATUS_VAR);
6403}
6404
6405float64 uint32_to_float64( uint32 a STATUS_PARAM )
6406{
6407    return int64_to_float64(a STATUS_VAR);
6408}
6409
6410uint32 float32_to_uint32( float32 a STATUS_PARAM )
6411{
6412    int64_t v;
6413    uint32 res;
6414
6415    v = float32_to_int64(a STATUS_VAR);
6416    if (v < 0) {
6417        res = 0;
6418        float_raise( float_flag_invalid STATUS_VAR);
6419    } else if (v > 0xffffffff) {
6420        res = 0xffffffff;
6421        float_raise( float_flag_invalid STATUS_VAR);
6422    } else {
6423        res = v;
6424    }
6425    return res;
6426}
6427
6428uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6429{
6430    int64_t v;
6431    uint32 res;
6432
6433    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6434    if (v < 0) {
6435        res = 0;
6436        float_raise( float_flag_invalid STATUS_VAR);
6437    } else if (v > 0xffffffff) {
6438        res = 0xffffffff;
6439        float_raise( float_flag_invalid STATUS_VAR);
6440    } else {
6441        res = v;
6442    }
6443    return res;
6444}
6445
6446uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6447{
6448    int64_t v;
6449    uint_fast16_t res;
6450
6451    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6452    if (v < 0) {
6453        res = 0;
6454        float_raise( float_flag_invalid STATUS_VAR);
6455    } else if (v > 0xffff) {
6456        res = 0xffff;
6457        float_raise( float_flag_invalid STATUS_VAR);
6458    } else {
6459        res = v;
6460    }
6461    return res;
6462}
6463
6464uint32 float64_to_uint32( float64 a STATUS_PARAM )
6465{
6466    int64_t v;
6467    uint32 res;
6468
6469    v = float64_to_int64(a STATUS_VAR);
6470    if (v < 0) {
6471        res = 0;
6472        float_raise( float_flag_invalid STATUS_VAR);
6473    } else if (v > 0xffffffff) {
6474        res = 0xffffffff;
6475        float_raise( float_flag_invalid STATUS_VAR);
6476    } else {
6477        res = v;
6478    }
6479    return res;
6480}
6481
6482uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6483{
6484    int64_t v;
6485    uint32 res;
6486
6487    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6488    if (v < 0) {
6489        res = 0;
6490        float_raise( float_flag_invalid STATUS_VAR);
6491    } else if (v > 0xffffffff) {
6492        res = 0xffffffff;
6493        float_raise( float_flag_invalid STATUS_VAR);
6494    } else {
6495        res = v;
6496    }
6497    return res;
6498}
6499
6500uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6501{
6502    int64_t v;
6503    uint_fast16_t res;
6504
6505    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6506    if (v < 0) {
6507        res = 0;
6508        float_raise( float_flag_invalid STATUS_VAR);
6509    } else if (v > 0xffff) {
6510        res = 0xffff;
6511        float_raise( float_flag_invalid STATUS_VAR);
6512    } else {
6513        res = v;
6514    }
6515    return res;
6516}
6517
6518/* FIXME: This looks broken.  */
6519uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6520{
6521    int64_t v;
6522
6523    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6524    v += float64_val(a);
6525    v = float64_to_int64(make_float64(v) STATUS_VAR);
6526
6527    return v - INT64_MIN;
6528}
6529
6530uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6531{
6532    int64_t v;
6533
6534    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6535    v += float64_val(a);
6536    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6537
6538    return v - INT64_MIN;
6539}
6540
6541#define COMPARE(s, nan_exp)                                                  \
6542INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6543                                      int is_quiet STATUS_PARAM )            \
6544{                                                                            \
6545    flag aSign, bSign;                                                       \
6546    uint ## s ## _t av, bv;                                                  \
6547    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6548    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6549                                                                             \
6550    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6551         extractFloat ## s ## Frac( a ) ) ||                                 \
6552        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6553          extractFloat ## s ## Frac( b ) )) {                                \
6554        if (!is_quiet ||                                                     \
6555            float ## s ## _is_signaling_nan( a ) ||                          \
6556            float ## s ## _is_signaling_nan( b ) ) {                         \
6557            float_raise( float_flag_invalid STATUS_VAR);                     \
6558        }                                                                    \
6559        return float_relation_unordered;                                     \
6560    }                                                                        \
6561    aSign = extractFloat ## s ## Sign( a );                                  \
6562    bSign = extractFloat ## s ## Sign( b );                                  \
6563    av = float ## s ## _val(a);                                              \
6564    bv = float ## s ## _val(b);                                              \
6565    if ( aSign != bSign ) {                                                  \
6566        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6567            /* zero case */                                                  \
6568            return float_relation_equal;                                     \
6569        } else {                                                             \
6570            return 1 - (2 * aSign);                                          \
6571        }                                                                    \
6572    } else {                                                                 \
6573        if (av == bv) {                                                      \
6574            return float_relation_equal;                                     \
6575        } else {                                                             \
6576            return 1 - 2 * (aSign ^ ( av < bv ));                            \
6577        }                                                                    \
6578    }                                                                        \
6579}                                                                            \
6580                                                                             \
6581int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6582{                                                                            \
6583    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6584}                                                                            \
6585                                                                             \
6586int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6587{                                                                            \
6588    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6589}
6590
6591COMPARE(32, 0xff)
6592COMPARE(64, 0x7ff)
6593
6594INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6595                                      int is_quiet STATUS_PARAM )
6596{
6597    flag aSign, bSign;
6598
6599    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6600          ( extractFloatx80Frac( a )<<1 ) ) ||
6601        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6602          ( extractFloatx80Frac( b )<<1 ) )) {
6603        if (!is_quiet ||
6604            floatx80_is_signaling_nan( a ) ||
6605            floatx80_is_signaling_nan( b ) ) {
6606            float_raise( float_flag_invalid STATUS_VAR);
6607        }
6608        return float_relation_unordered;
6609    }
6610    aSign = extractFloatx80Sign( a );
6611    bSign = extractFloatx80Sign( b );
6612    if ( aSign != bSign ) {
6613
6614        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6615             ( ( a.low | b.low ) == 0 ) ) {
6616            /* zero case */
6617            return float_relation_equal;
6618        } else {
6619            return 1 - (2 * aSign);
6620        }
6621    } else {
6622        if (a.low == b.low && a.high == b.high) {
6623            return float_relation_equal;
6624        } else {
6625            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6626        }
6627    }
6628}
6629
6630int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6631{
6632    return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6633}
6634
6635int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6636{
6637    return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6638}
6639
6640INLINE int float128_compare_internal( float128 a, float128 b,
6641                                      int is_quiet STATUS_PARAM )
6642{
6643    flag aSign, bSign;
6644
6645    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6646          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6647        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6648          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6649        if (!is_quiet ||
6650            float128_is_signaling_nan( a ) ||
6651            float128_is_signaling_nan( b ) ) {
6652            float_raise( float_flag_invalid STATUS_VAR);
6653        }
6654        return float_relation_unordered;
6655    }
6656    aSign = extractFloat128Sign( a );
6657    bSign = extractFloat128Sign( b );
6658    if ( aSign != bSign ) {
6659        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6660            /* zero case */
6661            return float_relation_equal;
6662        } else {
6663            return 1 - (2 * aSign);
6664        }
6665    } else {
6666        if (a.low == b.low && a.high == b.high) {
6667            return float_relation_equal;
6668        } else {
6669            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6670        }
6671    }
6672}
6673
6674int float128_compare( float128 a, float128 b STATUS_PARAM )
6675{
6676    return float128_compare_internal(a, b, 0 STATUS_VAR);
6677}
6678
6679int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6680{
6681    return float128_compare_internal(a, b, 1 STATUS_VAR);
6682}
6683
6684/* min() and max() functions. These can't be implemented as
6685 * 'compare and pick one input' because that would mishandle
6686 * NaNs and +0 vs -0.
6687 */
6688#define MINMAX(s, nan_exp)                                              \
6689INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6690                                        int ismin STATUS_PARAM )        \
6691{                                                                       \
6692    flag aSign, bSign;                                                  \
6693    uint ## s ## _t av, bv;                                             \
6694    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6695    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6696    if (float ## s ## _is_any_nan(a) ||                                 \
6697        float ## s ## _is_any_nan(b)) {                                 \
6698        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6699    }                                                                   \
6700    aSign = extractFloat ## s ## Sign(a);                               \
6701    bSign = extractFloat ## s ## Sign(b);                               \
6702    av = float ## s ## _val(a);                                         \
6703    bv = float ## s ## _val(b);                                         \
6704    if (aSign != bSign) {                                               \
6705        if (ismin) {                                                    \
6706            return aSign ? a : b;                                       \
6707        } else {                                                        \
6708            return aSign ? b : a;                                       \
6709        }                                                               \
6710    } else {                                                            \
6711        if (ismin) {                                                    \
6712            return (aSign ^ (av < bv)) ? a : b;                         \
6713        } else {                                                        \
6714            return (aSign ^ (av < bv)) ? b : a;                         \
6715        }                                                               \
6716    }                                                                   \
6717}                                                                       \
6718                                                                        \
6719float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6720{                                                                       \
6721    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6722}                                                                       \
6723                                                                        \
6724float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6725{                                                                       \
6726    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6727}
6728
6729MINMAX(32, 0xff)
6730MINMAX(64, 0x7ff)
6731
6732
6733/* Multiply A by 2 raised to the power N.  */
6734float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6735{
6736    flag aSign;
6737    int16_t aExp;
6738    uint32_t aSig;
6739
6740    a = float32_squash_input_denormal(a STATUS_VAR);
6741    aSig = extractFloat32Frac( a );
6742    aExp = extractFloat32Exp( a );
6743    aSign = extractFloat32Sign( a );
6744
6745    if ( aExp == 0xFF ) {
6746        if ( aSig ) {
6747            return propagateFloat32NaN( a, a STATUS_VAR );
6748        }
6749        return a;
6750    }
6751    if ( aExp != 0 )
6752        aSig |= 0x00800000;
6753    else if ( aSig == 0 )
6754        return a;
6755
6756    if (n > 0x200) {
6757        n = 0x200;
6758    } else if (n < -0x200) {
6759        n = -0x200;
6760    }
6761
6762    aExp += n - 1;
6763    aSig <<= 7;
6764    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6765}
6766
6767float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6768{
6769    flag aSign;
6770    int16_t aExp;
6771    uint64_t aSig;
6772
6773    a = float64_squash_input_denormal(a STATUS_VAR);
6774    aSig = extractFloat64Frac( a );
6775    aExp = extractFloat64Exp( a );
6776    aSign = extractFloat64Sign( a );
6777
6778    if ( aExp == 0x7FF ) {
6779        if ( aSig ) {
6780            return propagateFloat64NaN( a, a STATUS_VAR );
6781        }
6782        return a;
6783    }
6784    if ( aExp != 0 )
6785        aSig |= LIT64( 0x0010000000000000 );
6786    else if ( aSig == 0 )
6787        return a;
6788
6789    if (n > 0x1000) {
6790        n = 0x1000;
6791    } else if (n < -0x1000) {
6792        n = -0x1000;
6793    }
6794
6795    aExp += n - 1;
6796    aSig <<= 10;
6797    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6798}
6799
6800floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6801{
6802    flag aSign;
6803    int32_t aExp;
6804    uint64_t aSig;
6805
6806    aSig = extractFloatx80Frac( a );
6807    aExp = extractFloatx80Exp( a );
6808    aSign = extractFloatx80Sign( a );
6809
6810    if ( aExp == 0x7FFF ) {
6811        if ( aSig<<1 ) {
6812            return propagateFloatx80NaN( a, a STATUS_VAR );
6813        }
6814        return a;
6815    }
6816
6817    if (aExp == 0 && aSig == 0)
6818        return a;
6819
6820    if (n > 0x10000) {
6821        n = 0x10000;
6822    } else if (n < -0x10000) {
6823        n = -0x10000;
6824    }
6825
6826    aExp += n;
6827    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6828                                          aSign, aExp, aSig, 0 STATUS_VAR );
6829}
6830
6831float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6832{
6833    flag aSign;
6834    int32_t aExp;
6835    uint64_t aSig0, aSig1;
6836
6837    aSig1 = extractFloat128Frac1( a );
6838    aSig0 = extractFloat128Frac0( a );
6839    aExp = extractFloat128Exp( a );
6840    aSign = extractFloat128Sign( a );
6841    if ( aExp == 0x7FFF ) {
6842        if ( aSig0 | aSig1 ) {
6843            return propagateFloat128NaN( a, a STATUS_VAR );
6844        }
6845        return a;
6846    }
6847    if ( aExp != 0 )
6848        aSig0 |= LIT64( 0x0001000000000000 );
6849    else if ( aSig0 == 0 && aSig1 == 0 )
6850        return a;
6851
6852    if (n > 0x10000) {
6853        n = 0x10000;
6854    } else if (n < -0x10000) {
6855        n = -0x10000;
6856    }
6857
6858    aExp += n - 1;
6859    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6860                                          STATUS_VAR );
6861
6862}
6863