qemu/fpu/softfloat.c
<<
>>
Prefs
   1
   2/*============================================================================
   3
   4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5Package, Release 2b.
   6
   7Written by John R. Hauser.  This work was made possible in part by the
   8International Computer Science Institute, located at Suite 600, 1947 Center
   9Street, Berkeley, California 94704.  Funding was partially provided by the
  10National Science Foundation under grant MIP-9311980.  The original version
  11of this code was written as part of a project to build a fixed-point vector
  12processor in collaboration with the University of California at Berkeley,
  13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15arithmetic/SoftFloat.html'.
  16
  17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26Derivative works are acceptable, even for commercial purposes, so long as
  27(1) the source code for the derivative work includes prominent notice that
  28the work is derivative, and (2) the source code includes prominent notice with
  29these four paragraphs for those parts of this code that are retained.
  30
  31=============================================================================*/
  32
  33#include "softfloat.h"
  34
  35/*----------------------------------------------------------------------------
  36| Primitive arithmetic functions, including multi-word arithmetic, and
  37| division and square root approximations.  (Can be specialized to target if
  38| desired.)
  39*----------------------------------------------------------------------------*/
  40#include "softfloat-macros.h"
  41
  42/*----------------------------------------------------------------------------
  43| Functions and definitions to determine:  (1) whether tininess for underflow
  44| is detected before or after rounding by default, (2) what (if anything)
  45| happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47| are propagated from function inputs to output.  These details are target-
  48| specific.
  49*----------------------------------------------------------------------------*/
  50#include "softfloat-specialize.h"
  51
  52void set_float_rounding_mode(int val STATUS_PARAM)
  53{
  54    STATUS(float_rounding_mode) = val;
  55}
  56
  57void set_float_exception_flags(int val STATUS_PARAM)
  58{
  59    STATUS(float_exception_flags) = val;
  60}
  61
  62#ifdef FLOATX80
  63void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64{
  65    STATUS(floatx80_rounding_precision) = val;
  66}
  67#endif
  68
  69/*----------------------------------------------------------------------------
  70| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71| and 7, and returns the properly rounded 32-bit integer corresponding to the
  72| input.  If `zSign' is 1, the input is negated before being converted to an
  73| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  74| is simply rounded to an integer, with the inexact exception raised if the
  75| input cannot be represented exactly as an integer.  However, if the fixed-
  76| point input is too large, the invalid exception is raised and the largest
  77| positive or negative integer is returned.
  78*----------------------------------------------------------------------------*/
  79
  80static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  81{
  82    int8 roundingMode;
  83    flag roundNearestEven;
  84    int8 roundIncrement, roundBits;
  85    int32 z;
  86
  87    roundingMode = STATUS(float_rounding_mode);
  88    roundNearestEven = ( roundingMode == float_round_nearest_even );
  89    roundIncrement = 0x40;
  90    if ( ! roundNearestEven ) {
  91        if ( roundingMode == float_round_to_zero ) {
  92            roundIncrement = 0;
  93        }
  94        else {
  95            roundIncrement = 0x7F;
  96            if ( zSign ) {
  97                if ( roundingMode == float_round_up ) roundIncrement = 0;
  98            }
  99            else {
 100                if ( roundingMode == float_round_down ) roundIncrement = 0;
 101            }
 102        }
 103    }
 104    roundBits = absZ & 0x7F;
 105    absZ = ( absZ + roundIncrement )>>7;
 106    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107    z = absZ;
 108    if ( zSign ) z = - z;
 109    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110        float_raise( float_flag_invalid STATUS_VAR);
 111        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 112    }
 113    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 114    return z;
 115
 116}
 117
 118/*----------------------------------------------------------------------------
 119| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 120| `absZ1', with binary point between bits 63 and 64 (between the input words),
 121| and returns the properly rounded 64-bit integer corresponding to the input.
 122| If `zSign' is 1, the input is negated before being converted to an integer.
 123| Ordinarily, the fixed-point input is simply rounded to an integer, with
 124| the inexact exception raised if the input cannot be represented exactly as
 125| an integer.  However, if the fixed-point input is too large, the invalid
 126| exception is raised and the largest positive or negative integer is
 127| returned.
 128*----------------------------------------------------------------------------*/
 129
 130static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 131{
 132    int8 roundingMode;
 133    flag roundNearestEven, increment;
 134    int64 z;
 135
 136    roundingMode = STATUS(float_rounding_mode);
 137    roundNearestEven = ( roundingMode == float_round_nearest_even );
 138    increment = ( (sbits64) absZ1 < 0 );
 139    if ( ! roundNearestEven ) {
 140        if ( roundingMode == float_round_to_zero ) {
 141            increment = 0;
 142        }
 143        else {
 144            if ( zSign ) {
 145                increment = ( roundingMode == float_round_down ) && absZ1;
 146            }
 147            else {
 148                increment = ( roundingMode == float_round_up ) && absZ1;
 149            }
 150        }
 151    }
 152    if ( increment ) {
 153        ++absZ0;
 154        if ( absZ0 == 0 ) goto overflow;
 155        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 156    }
 157    z = absZ0;
 158    if ( zSign ) z = - z;
 159    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 160 overflow:
 161        float_raise( float_flag_invalid STATUS_VAR);
 162        return
 163              zSign ? (sbits64) LIT64( 0x8000000000000000 )
 164            : LIT64( 0x7FFFFFFFFFFFFFFF );
 165    }
 166    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 167    return z;
 168
 169}
 170
 171/*----------------------------------------------------------------------------
 172| Returns the fraction bits of the single-precision floating-point value `a'.
 173*----------------------------------------------------------------------------*/
 174
 175INLINE bits32 extractFloat32Frac( float32 a )
 176{
 177
 178    return float32_val(a) & 0x007FFFFF;
 179
 180}
 181
 182/*----------------------------------------------------------------------------
 183| Returns the exponent bits of the single-precision floating-point value `a'.
 184*----------------------------------------------------------------------------*/
 185
 186INLINE int16 extractFloat32Exp( float32 a )
 187{
 188
 189    return ( float32_val(a)>>23 ) & 0xFF;
 190
 191}
 192
 193/*----------------------------------------------------------------------------
 194| Returns the sign bit of the single-precision floating-point value `a'.
 195*----------------------------------------------------------------------------*/
 196
 197INLINE flag extractFloat32Sign( float32 a )
 198{
 199
 200    return float32_val(a)>>31;
 201
 202}
 203
 204/*----------------------------------------------------------------------------
 205| If `a' is denormal and we are in flush-to-zero mode then set the
 206| input-denormal exception and return zero. Otherwise just return the value.
 207*----------------------------------------------------------------------------*/
 208static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 209{
 210    if (STATUS(flush_inputs_to_zero)) {
 211        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 212            float_raise(float_flag_input_denormal STATUS_VAR);
 213            return make_float32(float32_val(a) & 0x80000000);
 214        }
 215    }
 216    return a;
 217}
 218
 219/*----------------------------------------------------------------------------
 220| Normalizes the subnormal single-precision floating-point value represented
 221| by the denormalized significand `aSig'.  The normalized exponent and
 222| significand are stored at the locations pointed to by `zExpPtr' and
 223| `zSigPtr', respectively.
 224*----------------------------------------------------------------------------*/
 225
 226static void
 227 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 228{
 229    int8 shiftCount;
 230
 231    shiftCount = countLeadingZeros32( aSig ) - 8;
 232    *zSigPtr = aSig<<shiftCount;
 233    *zExpPtr = 1 - shiftCount;
 234
 235}
 236
 237/*----------------------------------------------------------------------------
 238| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 239| single-precision floating-point value, returning the result.  After being
 240| shifted into the proper positions, the three fields are simply added
 241| together to form the result.  This means that any integer portion of `zSig'
 242| will be added into the exponent.  Since a properly normalized significand
 243| will have an integer portion equal to 1, the `zExp' input should be 1 less
 244| than the desired result exponent whenever `zSig' is a complete, normalized
 245| significand.
 246*----------------------------------------------------------------------------*/
 247
 248INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 249{
 250
 251    return make_float32(
 252          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 253
 254}
 255
 256/*----------------------------------------------------------------------------
 257| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 258| and significand `zSig', and returns the proper single-precision floating-
 259| point value corresponding to the abstract input.  Ordinarily, the abstract
 260| value is simply rounded and packed into the single-precision format, with
 261| the inexact exception raised if the abstract input cannot be represented
 262| exactly.  However, if the abstract value is too large, the overflow and
 263| inexact exceptions are raised and an infinity or maximal finite value is
 264| returned.  If the abstract value is too small, the input value is rounded to
 265| a subnormal number, and the underflow and inexact exceptions are raised if
 266| the abstract input cannot be represented exactly as a subnormal single-
 267| precision floating-point number.
 268|     The input significand `zSig' has its binary point between bits 30
 269| and 29, which is 7 bits to the left of the usual location.  This shifted
 270| significand must be normalized or smaller.  If `zSig' is not normalized,
 271| `zExp' must be 0; in that case, the result returned is a subnormal number,
 272| and it must not require rounding.  In the usual case that `zSig' is
 273| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 274| The handling of underflow and overflow follows the IEC/IEEE Standard for
 275| Binary Floating-Point Arithmetic.
 276*----------------------------------------------------------------------------*/
 277
 278static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 279{
 280    int8 roundingMode;
 281    flag roundNearestEven;
 282    int8 roundIncrement, roundBits;
 283    flag isTiny;
 284
 285    roundingMode = STATUS(float_rounding_mode);
 286    roundNearestEven = ( roundingMode == float_round_nearest_even );
 287    roundIncrement = 0x40;
 288    if ( ! roundNearestEven ) {
 289        if ( roundingMode == float_round_to_zero ) {
 290            roundIncrement = 0;
 291        }
 292        else {
 293            roundIncrement = 0x7F;
 294            if ( zSign ) {
 295                if ( roundingMode == float_round_up ) roundIncrement = 0;
 296            }
 297            else {
 298                if ( roundingMode == float_round_down ) roundIncrement = 0;
 299            }
 300        }
 301    }
 302    roundBits = zSig & 0x7F;
 303    if ( 0xFD <= (bits16) zExp ) {
 304        if (    ( 0xFD < zExp )
 305             || (    ( zExp == 0xFD )
 306                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 307           ) {
 308            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 309            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 310        }
 311        if ( zExp < 0 ) {
 312            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 313            isTiny =
 314                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 315                || ( zExp < -1 )
 316                || ( zSig + roundIncrement < 0x80000000 );
 317            shift32RightJamming( zSig, - zExp, &zSig );
 318            zExp = 0;
 319            roundBits = zSig & 0x7F;
 320            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 321        }
 322    }
 323    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 324    zSig = ( zSig + roundIncrement )>>7;
 325    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 326    if ( zSig == 0 ) zExp = 0;
 327    return packFloat32( zSign, zExp, zSig );
 328
 329}
 330
 331/*----------------------------------------------------------------------------
 332| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 333| and significand `zSig', and returns the proper single-precision floating-
 334| point value corresponding to the abstract input.  This routine is just like
 335| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 336| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 337| floating-point exponent.
 338*----------------------------------------------------------------------------*/
 339
 340static float32
 341 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 342{
 343    int8 shiftCount;
 344
 345    shiftCount = countLeadingZeros32( zSig ) - 1;
 346    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 347
 348}
 349
 350/*----------------------------------------------------------------------------
 351| Returns the fraction bits of the double-precision floating-point value `a'.
 352*----------------------------------------------------------------------------*/
 353
 354INLINE bits64 extractFloat64Frac( float64 a )
 355{
 356
 357    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 358
 359}
 360
 361/*----------------------------------------------------------------------------
 362| Returns the exponent bits of the double-precision floating-point value `a'.
 363*----------------------------------------------------------------------------*/
 364
 365INLINE int16 extractFloat64Exp( float64 a )
 366{
 367
 368    return ( float64_val(a)>>52 ) & 0x7FF;
 369
 370}
 371
 372/*----------------------------------------------------------------------------
 373| Returns the sign bit of the double-precision floating-point value `a'.
 374*----------------------------------------------------------------------------*/
 375
 376INLINE flag extractFloat64Sign( float64 a )
 377{
 378
 379    return float64_val(a)>>63;
 380
 381}
 382
 383/*----------------------------------------------------------------------------
 384| If `a' is denormal and we are in flush-to-zero mode then set the
 385| input-denormal exception and return zero. Otherwise just return the value.
 386*----------------------------------------------------------------------------*/
 387static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 388{
 389    if (STATUS(flush_inputs_to_zero)) {
 390        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 391            float_raise(float_flag_input_denormal STATUS_VAR);
 392            return make_float64(float64_val(a) & (1ULL << 63));
 393        }
 394    }
 395    return a;
 396}
 397
 398/*----------------------------------------------------------------------------
 399| Normalizes the subnormal double-precision floating-point value represented
 400| by the denormalized significand `aSig'.  The normalized exponent and
 401| significand are stored at the locations pointed to by `zExpPtr' and
 402| `zSigPtr', respectively.
 403*----------------------------------------------------------------------------*/
 404
 405static void
 406 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 407{
 408    int8 shiftCount;
 409
 410    shiftCount = countLeadingZeros64( aSig ) - 11;
 411    *zSigPtr = aSig<<shiftCount;
 412    *zExpPtr = 1 - shiftCount;
 413
 414}
 415
 416/*----------------------------------------------------------------------------
 417| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 418| double-precision floating-point value, returning the result.  After being
 419| shifted into the proper positions, the three fields are simply added
 420| together to form the result.  This means that any integer portion of `zSig'
 421| will be added into the exponent.  Since a properly normalized significand
 422| will have an integer portion equal to 1, the `zExp' input should be 1 less
 423| than the desired result exponent whenever `zSig' is a complete, normalized
 424| significand.
 425*----------------------------------------------------------------------------*/
 426
 427INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 428{
 429
 430    return make_float64(
 431        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 432
 433}
 434
 435/*----------------------------------------------------------------------------
 436| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 437| and significand `zSig', and returns the proper double-precision floating-
 438| point value corresponding to the abstract input.  Ordinarily, the abstract
 439| value is simply rounded and packed into the double-precision format, with
 440| the inexact exception raised if the abstract input cannot be represented
 441| exactly.  However, if the abstract value is too large, the overflow and
 442| inexact exceptions are raised and an infinity or maximal finite value is
 443| returned.  If the abstract value is too small, the input value is rounded
 444| to a subnormal number, and the underflow and inexact exceptions are raised
 445| if the abstract input cannot be represented exactly as a subnormal double-
 446| precision floating-point number.
 447|     The input significand `zSig' has its binary point between bits 62
 448| and 61, which is 10 bits to the left of the usual location.  This shifted
 449| significand must be normalized or smaller.  If `zSig' is not normalized,
 450| `zExp' must be 0; in that case, the result returned is a subnormal number,
 451| and it must not require rounding.  In the usual case that `zSig' is
 452| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 453| The handling of underflow and overflow follows the IEC/IEEE Standard for
 454| Binary Floating-Point Arithmetic.
 455*----------------------------------------------------------------------------*/
 456
 457static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 458{
 459    int8 roundingMode;
 460    flag roundNearestEven;
 461    int16 roundIncrement, roundBits;
 462    flag isTiny;
 463
 464    roundingMode = STATUS(float_rounding_mode);
 465    roundNearestEven = ( roundingMode == float_round_nearest_even );
 466    roundIncrement = 0x200;
 467    if ( ! roundNearestEven ) {
 468        if ( roundingMode == float_round_to_zero ) {
 469            roundIncrement = 0;
 470        }
 471        else {
 472            roundIncrement = 0x3FF;
 473            if ( zSign ) {
 474                if ( roundingMode == float_round_up ) roundIncrement = 0;
 475            }
 476            else {
 477                if ( roundingMode == float_round_down ) roundIncrement = 0;
 478            }
 479        }
 480    }
 481    roundBits = zSig & 0x3FF;
 482    if ( 0x7FD <= (bits16) zExp ) {
 483        if (    ( 0x7FD < zExp )
 484             || (    ( zExp == 0x7FD )
 485                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 486           ) {
 487            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 488            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 489        }
 490        if ( zExp < 0 ) {
 491            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 492            isTiny =
 493                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 494                || ( zExp < -1 )
 495                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 496            shift64RightJamming( zSig, - zExp, &zSig );
 497            zExp = 0;
 498            roundBits = zSig & 0x3FF;
 499            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 500        }
 501    }
 502    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 503    zSig = ( zSig + roundIncrement )>>10;
 504    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 505    if ( zSig == 0 ) zExp = 0;
 506    return packFloat64( zSign, zExp, zSig );
 507
 508}
 509
 510/*----------------------------------------------------------------------------
 511| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 512| and significand `zSig', and returns the proper double-precision floating-
 513| point value corresponding to the abstract input.  This routine is just like
 514| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 515| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 516| floating-point exponent.
 517*----------------------------------------------------------------------------*/
 518
 519static float64
 520 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 521{
 522    int8 shiftCount;
 523
 524    shiftCount = countLeadingZeros64( zSig ) - 1;
 525    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 526
 527}
 528
 529#ifdef FLOATX80
 530
 531/*----------------------------------------------------------------------------
 532| Returns the fraction bits of the extended double-precision floating-point
 533| value `a'.
 534*----------------------------------------------------------------------------*/
 535
 536INLINE bits64 extractFloatx80Frac( floatx80 a )
 537{
 538
 539    return a.low;
 540
 541}
 542
 543/*----------------------------------------------------------------------------
 544| Returns the exponent bits of the extended double-precision floating-point
 545| value `a'.
 546*----------------------------------------------------------------------------*/
 547
 548INLINE int32 extractFloatx80Exp( floatx80 a )
 549{
 550
 551    return a.high & 0x7FFF;
 552
 553}
 554
 555/*----------------------------------------------------------------------------
 556| Returns the sign bit of the extended double-precision floating-point value
 557| `a'.
 558*----------------------------------------------------------------------------*/
 559
 560INLINE flag extractFloatx80Sign( floatx80 a )
 561{
 562
 563    return a.high>>15;
 564
 565}
 566
 567/*----------------------------------------------------------------------------
 568| Normalizes the subnormal extended double-precision floating-point value
 569| represented by the denormalized significand `aSig'.  The normalized exponent
 570| and significand are stored at the locations pointed to by `zExpPtr' and
 571| `zSigPtr', respectively.
 572*----------------------------------------------------------------------------*/
 573
 574static void
 575 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 576{
 577    int8 shiftCount;
 578
 579    shiftCount = countLeadingZeros64( aSig );
 580    *zSigPtr = aSig<<shiftCount;
 581    *zExpPtr = 1 - shiftCount;
 582
 583}
 584
 585/*----------------------------------------------------------------------------
 586| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 587| extended double-precision floating-point value, returning the result.
 588*----------------------------------------------------------------------------*/
 589
 590INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 591{
 592    floatx80 z;
 593
 594    z.low = zSig;
 595    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 596    return z;
 597
 598}
 599
 600/*----------------------------------------------------------------------------
 601| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 602| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 603| and returns the proper extended double-precision floating-point value
 604| corresponding to the abstract input.  Ordinarily, the abstract value is
 605| rounded and packed into the extended double-precision format, with the
 606| inexact exception raised if the abstract input cannot be represented
 607| exactly.  However, if the abstract value is too large, the overflow and
 608| inexact exceptions are raised and an infinity or maximal finite value is
 609| returned.  If the abstract value is too small, the input value is rounded to
 610| a subnormal number, and the underflow and inexact exceptions are raised if
 611| the abstract input cannot be represented exactly as a subnormal extended
 612| double-precision floating-point number.
 613|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 614| number of bits as single or double precision, respectively.  Otherwise, the
 615| result is rounded to the full precision of the extended double-precision
 616| format.
 617|     The input significand must be normalized or smaller.  If the input
 618| significand is not normalized, `zExp' must be 0; in that case, the result
 619| returned is a subnormal number, and it must not require rounding.  The
 620| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 621| Floating-Point Arithmetic.
 622*----------------------------------------------------------------------------*/
 623
 624static floatx80
 625 roundAndPackFloatx80(
 626     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 627 STATUS_PARAM)
 628{
 629    int8 roundingMode;
 630    flag roundNearestEven, increment, isTiny;
 631    int64 roundIncrement, roundMask, roundBits;
 632
 633    roundingMode = STATUS(float_rounding_mode);
 634    roundNearestEven = ( roundingMode == float_round_nearest_even );
 635    if ( roundingPrecision == 80 ) goto precision80;
 636    if ( roundingPrecision == 64 ) {
 637        roundIncrement = LIT64( 0x0000000000000400 );
 638        roundMask = LIT64( 0x00000000000007FF );
 639    }
 640    else if ( roundingPrecision == 32 ) {
 641        roundIncrement = LIT64( 0x0000008000000000 );
 642        roundMask = LIT64( 0x000000FFFFFFFFFF );
 643    }
 644    else {
 645        goto precision80;
 646    }
 647    zSig0 |= ( zSig1 != 0 );
 648    if ( ! roundNearestEven ) {
 649        if ( roundingMode == float_round_to_zero ) {
 650            roundIncrement = 0;
 651        }
 652        else {
 653            roundIncrement = roundMask;
 654            if ( zSign ) {
 655                if ( roundingMode == float_round_up ) roundIncrement = 0;
 656            }
 657            else {
 658                if ( roundingMode == float_round_down ) roundIncrement = 0;
 659            }
 660        }
 661    }
 662    roundBits = zSig0 & roundMask;
 663    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 664        if (    ( 0x7FFE < zExp )
 665             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 666           ) {
 667            goto overflow;
 668        }
 669        if ( zExp <= 0 ) {
 670            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 671            isTiny =
 672                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 673                || ( zExp < 0 )
 674                || ( zSig0 <= zSig0 + roundIncrement );
 675            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 676            zExp = 0;
 677            roundBits = zSig0 & roundMask;
 678            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 679            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 680            zSig0 += roundIncrement;
 681            if ( (sbits64) zSig0 < 0 ) zExp = 1;
 682            roundIncrement = roundMask + 1;
 683            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 684                roundMask |= roundIncrement;
 685            }
 686            zSig0 &= ~ roundMask;
 687            return packFloatx80( zSign, zExp, zSig0 );
 688        }
 689    }
 690    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 691    zSig0 += roundIncrement;
 692    if ( zSig0 < roundIncrement ) {
 693        ++zExp;
 694        zSig0 = LIT64( 0x8000000000000000 );
 695    }
 696    roundIncrement = roundMask + 1;
 697    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 698        roundMask |= roundIncrement;
 699    }
 700    zSig0 &= ~ roundMask;
 701    if ( zSig0 == 0 ) zExp = 0;
 702    return packFloatx80( zSign, zExp, zSig0 );
 703 precision80:
 704    increment = ( (sbits64) zSig1 < 0 );
 705    if ( ! roundNearestEven ) {
 706        if ( roundingMode == float_round_to_zero ) {
 707            increment = 0;
 708        }
 709        else {
 710            if ( zSign ) {
 711                increment = ( roundingMode == float_round_down ) && zSig1;
 712            }
 713            else {
 714                increment = ( roundingMode == float_round_up ) && zSig1;
 715            }
 716        }
 717    }
 718    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 719        if (    ( 0x7FFE < zExp )
 720             || (    ( zExp == 0x7FFE )
 721                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 722                  && increment
 723                )
 724           ) {
 725            roundMask = 0;
 726 overflow:
 727            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 728            if (    ( roundingMode == float_round_to_zero )
 729                 || ( zSign && ( roundingMode == float_round_up ) )
 730                 || ( ! zSign && ( roundingMode == float_round_down ) )
 731               ) {
 732                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 733            }
 734            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 735        }
 736        if ( zExp <= 0 ) {
 737            isTiny =
 738                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 739                || ( zExp < 0 )
 740                || ! increment
 741                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 742            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 743            zExp = 0;
 744            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 745            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 746            if ( roundNearestEven ) {
 747                increment = ( (sbits64) zSig1 < 0 );
 748            }
 749            else {
 750                if ( zSign ) {
 751                    increment = ( roundingMode == float_round_down ) && zSig1;
 752                }
 753                else {
 754                    increment = ( roundingMode == float_round_up ) && zSig1;
 755                }
 756            }
 757            if ( increment ) {
 758                ++zSig0;
 759                zSig0 &=
 760                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 761                if ( (sbits64) zSig0 < 0 ) zExp = 1;
 762            }
 763            return packFloatx80( zSign, zExp, zSig0 );
 764        }
 765    }
 766    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 767    if ( increment ) {
 768        ++zSig0;
 769        if ( zSig0 == 0 ) {
 770            ++zExp;
 771            zSig0 = LIT64( 0x8000000000000000 );
 772        }
 773        else {
 774            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 775        }
 776    }
 777    else {
 778        if ( zSig0 == 0 ) zExp = 0;
 779    }
 780    return packFloatx80( zSign, zExp, zSig0 );
 781
 782}
 783
 784/*----------------------------------------------------------------------------
 785| Takes an abstract floating-point value having sign `zSign', exponent
 786| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 787| and returns the proper extended double-precision floating-point value
 788| corresponding to the abstract input.  This routine is just like
 789| `roundAndPackFloatx80' except that the input significand does not have to be
 790| normalized.
 791*----------------------------------------------------------------------------*/
 792
 793static floatx80
 794 normalizeRoundAndPackFloatx80(
 795     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 796 STATUS_PARAM)
 797{
 798    int8 shiftCount;
 799
 800    if ( zSig0 == 0 ) {
 801        zSig0 = zSig1;
 802        zSig1 = 0;
 803        zExp -= 64;
 804    }
 805    shiftCount = countLeadingZeros64( zSig0 );
 806    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 807    zExp -= shiftCount;
 808    return
 809        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 810
 811}
 812
 813#endif
 814
 815#ifdef FLOAT128
 816
 817/*----------------------------------------------------------------------------
 818| Returns the least-significant 64 fraction bits of the quadruple-precision
 819| floating-point value `a'.
 820*----------------------------------------------------------------------------*/
 821
 822INLINE bits64 extractFloat128Frac1( float128 a )
 823{
 824
 825    return a.low;
 826
 827}
 828
 829/*----------------------------------------------------------------------------
 830| Returns the most-significant 48 fraction bits of the quadruple-precision
 831| floating-point value `a'.
 832*----------------------------------------------------------------------------*/
 833
 834INLINE bits64 extractFloat128Frac0( float128 a )
 835{
 836
 837    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 838
 839}
 840
 841/*----------------------------------------------------------------------------
 842| Returns the exponent bits of the quadruple-precision floating-point value
 843| `a'.
 844*----------------------------------------------------------------------------*/
 845
 846INLINE int32 extractFloat128Exp( float128 a )
 847{
 848
 849    return ( a.high>>48 ) & 0x7FFF;
 850
 851}
 852
 853/*----------------------------------------------------------------------------
 854| Returns the sign bit of the quadruple-precision floating-point value `a'.
 855*----------------------------------------------------------------------------*/
 856
 857INLINE flag extractFloat128Sign( float128 a )
 858{
 859
 860    return a.high>>63;
 861
 862}
 863
 864/*----------------------------------------------------------------------------
 865| Normalizes the subnormal quadruple-precision floating-point value
 866| represented by the denormalized significand formed by the concatenation of
 867| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 868| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 869| significand are stored at the location pointed to by `zSig0Ptr', and the
 870| least significant 64 bits of the normalized significand are stored at the
 871| location pointed to by `zSig1Ptr'.
 872*----------------------------------------------------------------------------*/
 873
 874static void
 875 normalizeFloat128Subnormal(
 876     bits64 aSig0,
 877     bits64 aSig1,
 878     int32 *zExpPtr,
 879     bits64 *zSig0Ptr,
 880     bits64 *zSig1Ptr
 881 )
 882{
 883    int8 shiftCount;
 884
 885    if ( aSig0 == 0 ) {
 886        shiftCount = countLeadingZeros64( aSig1 ) - 15;
 887        if ( shiftCount < 0 ) {
 888            *zSig0Ptr = aSig1>>( - shiftCount );
 889            *zSig1Ptr = aSig1<<( shiftCount & 63 );
 890        }
 891        else {
 892            *zSig0Ptr = aSig1<<shiftCount;
 893            *zSig1Ptr = 0;
 894        }
 895        *zExpPtr = - shiftCount - 63;
 896    }
 897    else {
 898        shiftCount = countLeadingZeros64( aSig0 ) - 15;
 899        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 900        *zExpPtr = 1 - shiftCount;
 901    }
 902
 903}
 904
 905/*----------------------------------------------------------------------------
 906| Packs the sign `zSign', the exponent `zExp', and the significand formed
 907| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 908| floating-point value, returning the result.  After being shifted into the
 909| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 910| added together to form the most significant 32 bits of the result.  This
 911| means that any integer portion of `zSig0' will be added into the exponent.
 912| Since a properly normalized significand will have an integer portion equal
 913| to 1, the `zExp' input should be 1 less than the desired result exponent
 914| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 915| significand.
 916*----------------------------------------------------------------------------*/
 917
 918INLINE float128
 919 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 920{
 921    float128 z;
 922
 923    z.low = zSig1;
 924    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 925    return z;
 926
 927}
 928
 929/*----------------------------------------------------------------------------
 930| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 931| and extended significand formed by the concatenation of `zSig0', `zSig1',
 932| and `zSig2', and returns the proper quadruple-precision floating-point value
 933| corresponding to the abstract input.  Ordinarily, the abstract value is
 934| simply rounded and packed into the quadruple-precision format, with the
 935| inexact exception raised if the abstract input cannot be represented
 936| exactly.  However, if the abstract value is too large, the overflow and
 937| inexact exceptions are raised and an infinity or maximal finite value is
 938| returned.  If the abstract value is too small, the input value is rounded to
 939| a subnormal number, and the underflow and inexact exceptions are raised if
 940| the abstract input cannot be represented exactly as a subnormal quadruple-
 941| precision floating-point number.
 942|     The input significand must be normalized or smaller.  If the input
 943| significand is not normalized, `zExp' must be 0; in that case, the result
 944| returned is a subnormal number, and it must not require rounding.  In the
 945| usual case that the input significand is normalized, `zExp' must be 1 less
 946| than the ``true'' floating-point exponent.  The handling of underflow and
 947| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 948*----------------------------------------------------------------------------*/
 949
 950static float128
 951 roundAndPackFloat128(
 952     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 953{
 954    int8 roundingMode;
 955    flag roundNearestEven, increment, isTiny;
 956
 957    roundingMode = STATUS(float_rounding_mode);
 958    roundNearestEven = ( roundingMode == float_round_nearest_even );
 959    increment = ( (sbits64) zSig2 < 0 );
 960    if ( ! roundNearestEven ) {
 961        if ( roundingMode == float_round_to_zero ) {
 962            increment = 0;
 963        }
 964        else {
 965            if ( zSign ) {
 966                increment = ( roundingMode == float_round_down ) && zSig2;
 967            }
 968            else {
 969                increment = ( roundingMode == float_round_up ) && zSig2;
 970            }
 971        }
 972    }
 973    if ( 0x7FFD <= (bits32) zExp ) {
 974        if (    ( 0x7FFD < zExp )
 975             || (    ( zExp == 0x7FFD )
 976                  && eq128(
 977                         LIT64( 0x0001FFFFFFFFFFFF ),
 978                         LIT64( 0xFFFFFFFFFFFFFFFF ),
 979                         zSig0,
 980                         zSig1
 981                     )
 982                  && increment
 983                )
 984           ) {
 985            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 986            if (    ( roundingMode == float_round_to_zero )
 987                 || ( zSign && ( roundingMode == float_round_up ) )
 988                 || ( ! zSign && ( roundingMode == float_round_down ) )
 989               ) {
 990                return
 991                    packFloat128(
 992                        zSign,
 993                        0x7FFE,
 994                        LIT64( 0x0000FFFFFFFFFFFF ),
 995                        LIT64( 0xFFFFFFFFFFFFFFFF )
 996                    );
 997            }
 998            return packFloat128( zSign, 0x7FFF, 0, 0 );
 999        }
1000        if ( zExp < 0 ) {
1001            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1002            isTiny =
1003                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1004                || ( zExp < -1 )
1005                || ! increment
1006                || lt128(
1007                       zSig0,
1008                       zSig1,
1009                       LIT64( 0x0001FFFFFFFFFFFF ),
1010                       LIT64( 0xFFFFFFFFFFFFFFFF )
1011                   );
1012            shift128ExtraRightJamming(
1013                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1014            zExp = 0;
1015            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1016            if ( roundNearestEven ) {
1017                increment = ( (sbits64) zSig2 < 0 );
1018            }
1019            else {
1020                if ( zSign ) {
1021                    increment = ( roundingMode == float_round_down ) && zSig2;
1022                }
1023                else {
1024                    increment = ( roundingMode == float_round_up ) && zSig2;
1025                }
1026            }
1027        }
1028    }
1029    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1030    if ( increment ) {
1031        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1032        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1033    }
1034    else {
1035        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1036    }
1037    return packFloat128( zSign, zExp, zSig0, zSig1 );
1038
1039}
1040
1041/*----------------------------------------------------------------------------
1042| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1043| and significand formed by the concatenation of `zSig0' and `zSig1', and
1044| returns the proper quadruple-precision floating-point value corresponding
1045| to the abstract input.  This routine is just like `roundAndPackFloat128'
1046| except that the input significand has fewer bits and does not have to be
1047| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1048| point exponent.
1049*----------------------------------------------------------------------------*/
1050
1051static float128
1052 normalizeRoundAndPackFloat128(
1053     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1054{
1055    int8 shiftCount;
1056    bits64 zSig2;
1057
1058    if ( zSig0 == 0 ) {
1059        zSig0 = zSig1;
1060        zSig1 = 0;
1061        zExp -= 64;
1062    }
1063    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1064    if ( 0 <= shiftCount ) {
1065        zSig2 = 0;
1066        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1067    }
1068    else {
1069        shift128ExtraRightJamming(
1070            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1071    }
1072    zExp -= shiftCount;
1073    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1074
1075}
1076
1077#endif
1078
1079/*----------------------------------------------------------------------------
1080| Returns the result of converting the 32-bit two's complement integer `a'
1081| to the single-precision floating-point format.  The conversion is performed
1082| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1083*----------------------------------------------------------------------------*/
1084
1085float32 int32_to_float32( int32 a STATUS_PARAM )
1086{
1087    flag zSign;
1088
1089    if ( a == 0 ) return float32_zero;
1090    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1091    zSign = ( a < 0 );
1092    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1093
1094}
1095
1096/*----------------------------------------------------------------------------
1097| Returns the result of converting the 32-bit two's complement integer `a'
1098| to the double-precision floating-point format.  The conversion is performed
1099| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1100*----------------------------------------------------------------------------*/
1101
1102float64 int32_to_float64( int32 a STATUS_PARAM )
1103{
1104    flag zSign;
1105    uint32 absA;
1106    int8 shiftCount;
1107    bits64 zSig;
1108
1109    if ( a == 0 ) return float64_zero;
1110    zSign = ( a < 0 );
1111    absA = zSign ? - a : a;
1112    shiftCount = countLeadingZeros32( absA ) + 21;
1113    zSig = absA;
1114    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1115
1116}
1117
1118#ifdef FLOATX80
1119
1120/*----------------------------------------------------------------------------
1121| Returns the result of converting the 32-bit two's complement integer `a'
1122| to the extended double-precision floating-point format.  The conversion
1123| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1124| Arithmetic.
1125*----------------------------------------------------------------------------*/
1126
1127floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1128{
1129    flag zSign;
1130    uint32 absA;
1131    int8 shiftCount;
1132    bits64 zSig;
1133
1134    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1135    zSign = ( a < 0 );
1136    absA = zSign ? - a : a;
1137    shiftCount = countLeadingZeros32( absA ) + 32;
1138    zSig = absA;
1139    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1140
1141}
1142
1143#endif
1144
1145#ifdef FLOAT128
1146
1147/*----------------------------------------------------------------------------
1148| Returns the result of converting the 32-bit two's complement integer `a' to
1149| the quadruple-precision floating-point format.  The conversion is performed
1150| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151*----------------------------------------------------------------------------*/
1152
1153float128 int32_to_float128( int32 a STATUS_PARAM )
1154{
1155    flag zSign;
1156    uint32 absA;
1157    int8 shiftCount;
1158    bits64 zSig0;
1159
1160    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1161    zSign = ( a < 0 );
1162    absA = zSign ? - a : a;
1163    shiftCount = countLeadingZeros32( absA ) + 17;
1164    zSig0 = absA;
1165    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1166
1167}
1168
1169#endif
1170
1171/*----------------------------------------------------------------------------
1172| Returns the result of converting the 64-bit two's complement integer `a'
1173| to the single-precision floating-point format.  The conversion is performed
1174| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1175*----------------------------------------------------------------------------*/
1176
1177float32 int64_to_float32( int64 a STATUS_PARAM )
1178{
1179    flag zSign;
1180    uint64 absA;
1181    int8 shiftCount;
1182
1183    if ( a == 0 ) return float32_zero;
1184    zSign = ( a < 0 );
1185    absA = zSign ? - a : a;
1186    shiftCount = countLeadingZeros64( absA ) - 40;
1187    if ( 0 <= shiftCount ) {
1188        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1189    }
1190    else {
1191        shiftCount += 7;
1192        if ( shiftCount < 0 ) {
1193            shift64RightJamming( absA, - shiftCount, &absA );
1194        }
1195        else {
1196            absA <<= shiftCount;
1197        }
1198        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1199    }
1200
1201}
1202
1203float32 uint64_to_float32( uint64 a STATUS_PARAM )
1204{
1205    int8 shiftCount;
1206
1207    if ( a == 0 ) return float32_zero;
1208    shiftCount = countLeadingZeros64( a ) - 40;
1209    if ( 0 <= shiftCount ) {
1210        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1211    }
1212    else {
1213        shiftCount += 7;
1214        if ( shiftCount < 0 ) {
1215            shift64RightJamming( a, - shiftCount, &a );
1216        }
1217        else {
1218            a <<= shiftCount;
1219        }
1220        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1221    }
1222}
1223
1224/*----------------------------------------------------------------------------
1225| Returns the result of converting the 64-bit two's complement integer `a'
1226| to the double-precision floating-point format.  The conversion is performed
1227| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1228*----------------------------------------------------------------------------*/
1229
1230float64 int64_to_float64( int64 a STATUS_PARAM )
1231{
1232    flag zSign;
1233
1234    if ( a == 0 ) return float64_zero;
1235    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1236        return packFloat64( 1, 0x43E, 0 );
1237    }
1238    zSign = ( a < 0 );
1239    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1240
1241}
1242
1243float64 uint64_to_float64( uint64 a STATUS_PARAM )
1244{
1245    if ( a == 0 ) return float64_zero;
1246    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1247
1248}
1249
1250#ifdef FLOATX80
1251
1252/*----------------------------------------------------------------------------
1253| Returns the result of converting the 64-bit two's complement integer `a'
1254| to the extended double-precision floating-point format.  The conversion
1255| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1256| Arithmetic.
1257*----------------------------------------------------------------------------*/
1258
1259floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1260{
1261    flag zSign;
1262    uint64 absA;
1263    int8 shiftCount;
1264
1265    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1266    zSign = ( a < 0 );
1267    absA = zSign ? - a : a;
1268    shiftCount = countLeadingZeros64( absA );
1269    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1270
1271}
1272
1273#endif
1274
1275#ifdef FLOAT128
1276
1277/*----------------------------------------------------------------------------
1278| Returns the result of converting the 64-bit two's complement integer `a' to
1279| the quadruple-precision floating-point format.  The conversion is performed
1280| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1281*----------------------------------------------------------------------------*/
1282
1283float128 int64_to_float128( int64 a STATUS_PARAM )
1284{
1285    flag zSign;
1286    uint64 absA;
1287    int8 shiftCount;
1288    int32 zExp;
1289    bits64 zSig0, zSig1;
1290
1291    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1292    zSign = ( a < 0 );
1293    absA = zSign ? - a : a;
1294    shiftCount = countLeadingZeros64( absA ) + 49;
1295    zExp = 0x406E - shiftCount;
1296    if ( 64 <= shiftCount ) {
1297        zSig1 = 0;
1298        zSig0 = absA;
1299        shiftCount -= 64;
1300    }
1301    else {
1302        zSig1 = absA;
1303        zSig0 = 0;
1304    }
1305    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1306    return packFloat128( zSign, zExp, zSig0, zSig1 );
1307
1308}
1309
1310#endif
1311
1312/*----------------------------------------------------------------------------
1313| Returns the result of converting the single-precision floating-point value
1314| `a' to the 32-bit two's complement integer format.  The conversion is
1315| performed according to the IEC/IEEE Standard for Binary Floating-Point
1316| Arithmetic---which means in particular that the conversion is rounded
1317| according to the current rounding mode.  If `a' is a NaN, the largest
1318| positive integer is returned.  Otherwise, if the conversion overflows, the
1319| largest integer with the same sign as `a' is returned.
1320*----------------------------------------------------------------------------*/
1321
1322int32 float32_to_int32( float32 a STATUS_PARAM )
1323{
1324    flag aSign;
1325    int16 aExp, shiftCount;
1326    bits32 aSig;
1327    bits64 aSig64;
1328
1329    a = float32_squash_input_denormal(a STATUS_VAR);
1330    aSig = extractFloat32Frac( a );
1331    aExp = extractFloat32Exp( a );
1332    aSign = extractFloat32Sign( a );
1333    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1334    if ( aExp ) aSig |= 0x00800000;
1335    shiftCount = 0xAF - aExp;
1336    aSig64 = aSig;
1337    aSig64 <<= 32;
1338    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1339    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1340
1341}
1342
1343/*----------------------------------------------------------------------------
1344| Returns the result of converting the single-precision floating-point value
1345| `a' to the 32-bit two's complement integer format.  The conversion is
1346| performed according to the IEC/IEEE Standard for Binary Floating-Point
1347| Arithmetic, except that the conversion is always rounded toward zero.
1348| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1349| the conversion overflows, the largest integer with the same sign as `a' is
1350| returned.
1351*----------------------------------------------------------------------------*/
1352
1353int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1354{
1355    flag aSign;
1356    int16 aExp, shiftCount;
1357    bits32 aSig;
1358    int32 z;
1359    a = float32_squash_input_denormal(a STATUS_VAR);
1360
1361    aSig = extractFloat32Frac( a );
1362    aExp = extractFloat32Exp( a );
1363    aSign = extractFloat32Sign( a );
1364    shiftCount = aExp - 0x9E;
1365    if ( 0 <= shiftCount ) {
1366        if ( float32_val(a) != 0xCF000000 ) {
1367            float_raise( float_flag_invalid STATUS_VAR);
1368            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1369        }
1370        return (sbits32) 0x80000000;
1371    }
1372    else if ( aExp <= 0x7E ) {
1373        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1374        return 0;
1375    }
1376    aSig = ( aSig | 0x00800000 )<<8;
1377    z = aSig>>( - shiftCount );
1378    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1379        STATUS(float_exception_flags) |= float_flag_inexact;
1380    }
1381    if ( aSign ) z = - z;
1382    return z;
1383
1384}
1385
1386/*----------------------------------------------------------------------------
1387| Returns the result of converting the single-precision floating-point value
1388| `a' to the 16-bit two's complement integer format.  The conversion is
1389| performed according to the IEC/IEEE Standard for Binary Floating-Point
1390| Arithmetic, except that the conversion is always rounded toward zero.
1391| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1392| the conversion overflows, the largest integer with the same sign as `a' is
1393| returned.
1394*----------------------------------------------------------------------------*/
1395
1396int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1397{
1398    flag aSign;
1399    int16 aExp, shiftCount;
1400    bits32 aSig;
1401    int32 z;
1402
1403    aSig = extractFloat32Frac( a );
1404    aExp = extractFloat32Exp( a );
1405    aSign = extractFloat32Sign( a );
1406    shiftCount = aExp - 0x8E;
1407    if ( 0 <= shiftCount ) {
1408        if ( float32_val(a) != 0xC7000000 ) {
1409            float_raise( float_flag_invalid STATUS_VAR);
1410            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1411                return 0x7FFF;
1412            }
1413        }
1414        return (sbits32) 0xffff8000;
1415    }
1416    else if ( aExp <= 0x7E ) {
1417        if ( aExp | aSig ) {
1418            STATUS(float_exception_flags) |= float_flag_inexact;
1419        }
1420        return 0;
1421    }
1422    shiftCount -= 0x10;
1423    aSig = ( aSig | 0x00800000 )<<8;
1424    z = aSig>>( - shiftCount );
1425    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1426        STATUS(float_exception_flags) |= float_flag_inexact;
1427    }
1428    if ( aSign ) {
1429        z = - z;
1430    }
1431    return z;
1432
1433}
1434
1435/*----------------------------------------------------------------------------
1436| Returns the result of converting the single-precision floating-point value
1437| `a' to the 64-bit two's complement integer format.  The conversion is
1438| performed according to the IEC/IEEE Standard for Binary Floating-Point
1439| Arithmetic---which means in particular that the conversion is rounded
1440| according to the current rounding mode.  If `a' is a NaN, the largest
1441| positive integer is returned.  Otherwise, if the conversion overflows, the
1442| largest integer with the same sign as `a' is returned.
1443*----------------------------------------------------------------------------*/
1444
1445int64 float32_to_int64( float32 a STATUS_PARAM )
1446{
1447    flag aSign;
1448    int16 aExp, shiftCount;
1449    bits32 aSig;
1450    bits64 aSig64, aSigExtra;
1451    a = float32_squash_input_denormal(a STATUS_VAR);
1452
1453    aSig = extractFloat32Frac( a );
1454    aExp = extractFloat32Exp( a );
1455    aSign = extractFloat32Sign( a );
1456    shiftCount = 0xBE - aExp;
1457    if ( shiftCount < 0 ) {
1458        float_raise( float_flag_invalid STATUS_VAR);
1459        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1460            return LIT64( 0x7FFFFFFFFFFFFFFF );
1461        }
1462        return (sbits64) LIT64( 0x8000000000000000 );
1463    }
1464    if ( aExp ) aSig |= 0x00800000;
1465    aSig64 = aSig;
1466    aSig64 <<= 40;
1467    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1468    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1469
1470}
1471
1472/*----------------------------------------------------------------------------
1473| Returns the result of converting the single-precision floating-point value
1474| `a' to the 64-bit two's complement integer format.  The conversion is
1475| performed according to the IEC/IEEE Standard for Binary Floating-Point
1476| Arithmetic, except that the conversion is always rounded toward zero.  If
1477| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1478| conversion overflows, the largest integer with the same sign as `a' is
1479| returned.
1480*----------------------------------------------------------------------------*/
1481
1482int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1483{
1484    flag aSign;
1485    int16 aExp, shiftCount;
1486    bits32 aSig;
1487    bits64 aSig64;
1488    int64 z;
1489    a = float32_squash_input_denormal(a STATUS_VAR);
1490
1491    aSig = extractFloat32Frac( a );
1492    aExp = extractFloat32Exp( a );
1493    aSign = extractFloat32Sign( a );
1494    shiftCount = aExp - 0xBE;
1495    if ( 0 <= shiftCount ) {
1496        if ( float32_val(a) != 0xDF000000 ) {
1497            float_raise( float_flag_invalid STATUS_VAR);
1498            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1499                return LIT64( 0x7FFFFFFFFFFFFFFF );
1500            }
1501        }
1502        return (sbits64) LIT64( 0x8000000000000000 );
1503    }
1504    else if ( aExp <= 0x7E ) {
1505        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1506        return 0;
1507    }
1508    aSig64 = aSig | 0x00800000;
1509    aSig64 <<= 40;
1510    z = aSig64>>( - shiftCount );
1511    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1512        STATUS(float_exception_flags) |= float_flag_inexact;
1513    }
1514    if ( aSign ) z = - z;
1515    return z;
1516
1517}
1518
1519/*----------------------------------------------------------------------------
1520| Returns the result of converting the single-precision floating-point value
1521| `a' to the double-precision floating-point format.  The conversion is
1522| performed according to the IEC/IEEE Standard for Binary Floating-Point
1523| Arithmetic.
1524*----------------------------------------------------------------------------*/
1525
1526float64 float32_to_float64( float32 a STATUS_PARAM )
1527{
1528    flag aSign;
1529    int16 aExp;
1530    bits32 aSig;
1531    a = float32_squash_input_denormal(a STATUS_VAR);
1532
1533    aSig = extractFloat32Frac( a );
1534    aExp = extractFloat32Exp( a );
1535    aSign = extractFloat32Sign( a );
1536    if ( aExp == 0xFF ) {
1537        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1538        return packFloat64( aSign, 0x7FF, 0 );
1539    }
1540    if ( aExp == 0 ) {
1541        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1542        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1543        --aExp;
1544    }
1545    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1546
1547}
1548
1549#ifdef FLOATX80
1550
1551/*----------------------------------------------------------------------------
1552| Returns the result of converting the single-precision floating-point value
1553| `a' to the extended double-precision floating-point format.  The conversion
1554| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1555| Arithmetic.
1556*----------------------------------------------------------------------------*/
1557
1558floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1559{
1560    flag aSign;
1561    int16 aExp;
1562    bits32 aSig;
1563
1564    a = float32_squash_input_denormal(a STATUS_VAR);
1565    aSig = extractFloat32Frac( a );
1566    aExp = extractFloat32Exp( a );
1567    aSign = extractFloat32Sign( a );
1568    if ( aExp == 0xFF ) {
1569        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1570        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1571    }
1572    if ( aExp == 0 ) {
1573        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1574        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575    }
1576    aSig |= 0x00800000;
1577    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1578
1579}
1580
1581#endif
1582
1583#ifdef FLOAT128
1584
1585/*----------------------------------------------------------------------------
1586| Returns the result of converting the single-precision floating-point value
1587| `a' to the double-precision floating-point format.  The conversion is
1588| performed according to the IEC/IEEE Standard for Binary Floating-Point
1589| Arithmetic.
1590*----------------------------------------------------------------------------*/
1591
1592float128 float32_to_float128( float32 a STATUS_PARAM )
1593{
1594    flag aSign;
1595    int16 aExp;
1596    bits32 aSig;
1597
1598    a = float32_squash_input_denormal(a STATUS_VAR);
1599    aSig = extractFloat32Frac( a );
1600    aExp = extractFloat32Exp( a );
1601    aSign = extractFloat32Sign( a );
1602    if ( aExp == 0xFF ) {
1603        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1604        return packFloat128( aSign, 0x7FFF, 0, 0 );
1605    }
1606    if ( aExp == 0 ) {
1607        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1608        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1609        --aExp;
1610    }
1611    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1612
1613}
1614
1615#endif
1616
1617/*----------------------------------------------------------------------------
1618| Rounds the single-precision floating-point value `a' to an integer, and
1619| returns the result as a single-precision floating-point value.  The
1620| operation is performed according to the IEC/IEEE Standard for Binary
1621| Floating-Point Arithmetic.
1622*----------------------------------------------------------------------------*/
1623
1624float32 float32_round_to_int( float32 a STATUS_PARAM)
1625{
1626    flag aSign;
1627    int16 aExp;
1628    bits32 lastBitMask, roundBitsMask;
1629    int8 roundingMode;
1630    bits32 z;
1631    a = float32_squash_input_denormal(a STATUS_VAR);
1632
1633    aExp = extractFloat32Exp( a );
1634    if ( 0x96 <= aExp ) {
1635        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1636            return propagateFloat32NaN( a, a STATUS_VAR );
1637        }
1638        return a;
1639    }
1640    if ( aExp <= 0x7E ) {
1641        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1642        STATUS(float_exception_flags) |= float_flag_inexact;
1643        aSign = extractFloat32Sign( a );
1644        switch ( STATUS(float_rounding_mode) ) {
1645         case float_round_nearest_even:
1646            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1647                return packFloat32( aSign, 0x7F, 0 );
1648            }
1649            break;
1650         case float_round_down:
1651            return make_float32(aSign ? 0xBF800000 : 0);
1652         case float_round_up:
1653            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1654        }
1655        return packFloat32( aSign, 0, 0 );
1656    }
1657    lastBitMask = 1;
1658    lastBitMask <<= 0x96 - aExp;
1659    roundBitsMask = lastBitMask - 1;
1660    z = float32_val(a);
1661    roundingMode = STATUS(float_rounding_mode);
1662    if ( roundingMode == float_round_nearest_even ) {
1663        z += lastBitMask>>1;
1664        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1665    }
1666    else if ( roundingMode != float_round_to_zero ) {
1667        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1668            z += roundBitsMask;
1669        }
1670    }
1671    z &= ~ roundBitsMask;
1672    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1673    return make_float32(z);
1674
1675}
1676
1677/*----------------------------------------------------------------------------
1678| Returns the result of adding the absolute values of the single-precision
1679| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1680| before being returned.  `zSign' is ignored if the result is a NaN.
1681| The addition is performed according to the IEC/IEEE Standard for Binary
1682| Floating-Point Arithmetic.
1683*----------------------------------------------------------------------------*/
1684
1685static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1686{
1687    int16 aExp, bExp, zExp;
1688    bits32 aSig, bSig, zSig;
1689    int16 expDiff;
1690
1691    aSig = extractFloat32Frac( a );
1692    aExp = extractFloat32Exp( a );
1693    bSig = extractFloat32Frac( b );
1694    bExp = extractFloat32Exp( b );
1695    expDiff = aExp - bExp;
1696    aSig <<= 6;
1697    bSig <<= 6;
1698    if ( 0 < expDiff ) {
1699        if ( aExp == 0xFF ) {
1700            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1701            return a;
1702        }
1703        if ( bExp == 0 ) {
1704            --expDiff;
1705        }
1706        else {
1707            bSig |= 0x20000000;
1708        }
1709        shift32RightJamming( bSig, expDiff, &bSig );
1710        zExp = aExp;
1711    }
1712    else if ( expDiff < 0 ) {
1713        if ( bExp == 0xFF ) {
1714            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1715            return packFloat32( zSign, 0xFF, 0 );
1716        }
1717        if ( aExp == 0 ) {
1718            ++expDiff;
1719        }
1720        else {
1721            aSig |= 0x20000000;
1722        }
1723        shift32RightJamming( aSig, - expDiff, &aSig );
1724        zExp = bExp;
1725    }
1726    else {
1727        if ( aExp == 0xFF ) {
1728            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1729            return a;
1730        }
1731        if ( aExp == 0 ) {
1732            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1733            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1734        }
1735        zSig = 0x40000000 + aSig + bSig;
1736        zExp = aExp;
1737        goto roundAndPack;
1738    }
1739    aSig |= 0x20000000;
1740    zSig = ( aSig + bSig )<<1;
1741    --zExp;
1742    if ( (sbits32) zSig < 0 ) {
1743        zSig = aSig + bSig;
1744        ++zExp;
1745    }
1746 roundAndPack:
1747    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1748
1749}
1750
1751/*----------------------------------------------------------------------------
1752| Returns the result of subtracting the absolute values of the single-
1753| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1754| difference is negated before being returned.  `zSign' is ignored if the
1755| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1756| Standard for Binary Floating-Point Arithmetic.
1757*----------------------------------------------------------------------------*/
1758
1759static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1760{
1761    int16 aExp, bExp, zExp;
1762    bits32 aSig, bSig, zSig;
1763    int16 expDiff;
1764
1765    aSig = extractFloat32Frac( a );
1766    aExp = extractFloat32Exp( a );
1767    bSig = extractFloat32Frac( b );
1768    bExp = extractFloat32Exp( b );
1769    expDiff = aExp - bExp;
1770    aSig <<= 7;
1771    bSig <<= 7;
1772    if ( 0 < expDiff ) goto aExpBigger;
1773    if ( expDiff < 0 ) goto bExpBigger;
1774    if ( aExp == 0xFF ) {
1775        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1776        float_raise( float_flag_invalid STATUS_VAR);
1777        return float32_default_nan;
1778    }
1779    if ( aExp == 0 ) {
1780        aExp = 1;
1781        bExp = 1;
1782    }
1783    if ( bSig < aSig ) goto aBigger;
1784    if ( aSig < bSig ) goto bBigger;
1785    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1786 bExpBigger:
1787    if ( bExp == 0xFF ) {
1788        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1789        return packFloat32( zSign ^ 1, 0xFF, 0 );
1790    }
1791    if ( aExp == 0 ) {
1792        ++expDiff;
1793    }
1794    else {
1795        aSig |= 0x40000000;
1796    }
1797    shift32RightJamming( aSig, - expDiff, &aSig );
1798    bSig |= 0x40000000;
1799 bBigger:
1800    zSig = bSig - aSig;
1801    zExp = bExp;
1802    zSign ^= 1;
1803    goto normalizeRoundAndPack;
1804 aExpBigger:
1805    if ( aExp == 0xFF ) {
1806        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807        return a;
1808    }
1809    if ( bExp == 0 ) {
1810        --expDiff;
1811    }
1812    else {
1813        bSig |= 0x40000000;
1814    }
1815    shift32RightJamming( bSig, expDiff, &bSig );
1816    aSig |= 0x40000000;
1817 aBigger:
1818    zSig = aSig - bSig;
1819    zExp = aExp;
1820 normalizeRoundAndPack:
1821    --zExp;
1822    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1823
1824}
1825
1826/*----------------------------------------------------------------------------
1827| Returns the result of adding the single-precision floating-point values `a'
1828| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1829| Binary Floating-Point Arithmetic.
1830*----------------------------------------------------------------------------*/
1831
1832float32 float32_add( float32 a, float32 b STATUS_PARAM )
1833{
1834    flag aSign, bSign;
1835    a = float32_squash_input_denormal(a STATUS_VAR);
1836    b = float32_squash_input_denormal(b STATUS_VAR);
1837
1838    aSign = extractFloat32Sign( a );
1839    bSign = extractFloat32Sign( b );
1840    if ( aSign == bSign ) {
1841        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1842    }
1843    else {
1844        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1845    }
1846
1847}
1848
1849/*----------------------------------------------------------------------------
1850| Returns the result of subtracting the single-precision floating-point values
1851| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1852| for Binary Floating-Point Arithmetic.
1853*----------------------------------------------------------------------------*/
1854
1855float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1856{
1857    flag aSign, bSign;
1858    a = float32_squash_input_denormal(a STATUS_VAR);
1859    b = float32_squash_input_denormal(b STATUS_VAR);
1860
1861    aSign = extractFloat32Sign( a );
1862    bSign = extractFloat32Sign( b );
1863    if ( aSign == bSign ) {
1864        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865    }
1866    else {
1867        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1868    }
1869
1870}
1871
1872/*----------------------------------------------------------------------------
1873| Returns the result of multiplying the single-precision floating-point values
1874| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1875| for Binary Floating-Point Arithmetic.
1876*----------------------------------------------------------------------------*/
1877
1878float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1879{
1880    flag aSign, bSign, zSign;
1881    int16 aExp, bExp, zExp;
1882    bits32 aSig, bSig;
1883    bits64 zSig64;
1884    bits32 zSig;
1885
1886    a = float32_squash_input_denormal(a STATUS_VAR);
1887    b = float32_squash_input_denormal(b STATUS_VAR);
1888
1889    aSig = extractFloat32Frac( a );
1890    aExp = extractFloat32Exp( a );
1891    aSign = extractFloat32Sign( a );
1892    bSig = extractFloat32Frac( b );
1893    bExp = extractFloat32Exp( b );
1894    bSign = extractFloat32Sign( b );
1895    zSign = aSign ^ bSign;
1896    if ( aExp == 0xFF ) {
1897        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1898            return propagateFloat32NaN( a, b STATUS_VAR );
1899        }
1900        if ( ( bExp | bSig ) == 0 ) {
1901            float_raise( float_flag_invalid STATUS_VAR);
1902            return float32_default_nan;
1903        }
1904        return packFloat32( zSign, 0xFF, 0 );
1905    }
1906    if ( bExp == 0xFF ) {
1907        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1908        if ( ( aExp | aSig ) == 0 ) {
1909            float_raise( float_flag_invalid STATUS_VAR);
1910            return float32_default_nan;
1911        }
1912        return packFloat32( zSign, 0xFF, 0 );
1913    }
1914    if ( aExp == 0 ) {
1915        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1916        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1917    }
1918    if ( bExp == 0 ) {
1919        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1920        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1921    }
1922    zExp = aExp + bExp - 0x7F;
1923    aSig = ( aSig | 0x00800000 )<<7;
1924    bSig = ( bSig | 0x00800000 )<<8;
1925    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1926    zSig = zSig64;
1927    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1928        zSig <<= 1;
1929        --zExp;
1930    }
1931    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1932
1933}
1934
1935/*----------------------------------------------------------------------------
1936| Returns the result of dividing the single-precision floating-point value `a'
1937| by the corresponding value `b'.  The operation is performed according to the
1938| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1939*----------------------------------------------------------------------------*/
1940
1941float32 float32_div( float32 a, float32 b STATUS_PARAM )
1942{
1943    flag aSign, bSign, zSign;
1944    int16 aExp, bExp, zExp;
1945    bits32 aSig, bSig, zSig;
1946    a = float32_squash_input_denormal(a STATUS_VAR);
1947    b = float32_squash_input_denormal(b STATUS_VAR);
1948
1949    aSig = extractFloat32Frac( a );
1950    aExp = extractFloat32Exp( a );
1951    aSign = extractFloat32Sign( a );
1952    bSig = extractFloat32Frac( b );
1953    bExp = extractFloat32Exp( b );
1954    bSign = extractFloat32Sign( b );
1955    zSign = aSign ^ bSign;
1956    if ( aExp == 0xFF ) {
1957        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1958        if ( bExp == 0xFF ) {
1959            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1960            float_raise( float_flag_invalid STATUS_VAR);
1961            return float32_default_nan;
1962        }
1963        return packFloat32( zSign, 0xFF, 0 );
1964    }
1965    if ( bExp == 0xFF ) {
1966        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1967        return packFloat32( zSign, 0, 0 );
1968    }
1969    if ( bExp == 0 ) {
1970        if ( bSig == 0 ) {
1971            if ( ( aExp | aSig ) == 0 ) {
1972                float_raise( float_flag_invalid STATUS_VAR);
1973                return float32_default_nan;
1974            }
1975            float_raise( float_flag_divbyzero STATUS_VAR);
1976            return packFloat32( zSign, 0xFF, 0 );
1977        }
1978        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1979    }
1980    if ( aExp == 0 ) {
1981        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1982        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1983    }
1984    zExp = aExp - bExp + 0x7D;
1985    aSig = ( aSig | 0x00800000 )<<7;
1986    bSig = ( bSig | 0x00800000 )<<8;
1987    if ( bSig <= ( aSig + aSig ) ) {
1988        aSig >>= 1;
1989        ++zExp;
1990    }
1991    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1992    if ( ( zSig & 0x3F ) == 0 ) {
1993        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1994    }
1995    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1996
1997}
1998
1999/*----------------------------------------------------------------------------
2000| Returns the remainder of the single-precision floating-point value `a'
2001| with respect to the corresponding value `b'.  The operation is performed
2002| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2003*----------------------------------------------------------------------------*/
2004
2005float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2006{
2007    flag aSign, zSign;
2008    int16 aExp, bExp, expDiff;
2009    bits32 aSig, bSig;
2010    bits32 q;
2011    bits64 aSig64, bSig64, q64;
2012    bits32 alternateASig;
2013    sbits32 sigMean;
2014    a = float32_squash_input_denormal(a STATUS_VAR);
2015    b = float32_squash_input_denormal(b STATUS_VAR);
2016
2017    aSig = extractFloat32Frac( a );
2018    aExp = extractFloat32Exp( a );
2019    aSign = extractFloat32Sign( a );
2020    bSig = extractFloat32Frac( b );
2021    bExp = extractFloat32Exp( b );
2022    if ( aExp == 0xFF ) {
2023        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2024            return propagateFloat32NaN( a, b STATUS_VAR );
2025        }
2026        float_raise( float_flag_invalid STATUS_VAR);
2027        return float32_default_nan;
2028    }
2029    if ( bExp == 0xFF ) {
2030        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2031        return a;
2032    }
2033    if ( bExp == 0 ) {
2034        if ( bSig == 0 ) {
2035            float_raise( float_flag_invalid STATUS_VAR);
2036            return float32_default_nan;
2037        }
2038        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2039    }
2040    if ( aExp == 0 ) {
2041        if ( aSig == 0 ) return a;
2042        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2043    }
2044    expDiff = aExp - bExp;
2045    aSig |= 0x00800000;
2046    bSig |= 0x00800000;
2047    if ( expDiff < 32 ) {
2048        aSig <<= 8;
2049        bSig <<= 8;
2050        if ( expDiff < 0 ) {
2051            if ( expDiff < -1 ) return a;
2052            aSig >>= 1;
2053        }
2054        q = ( bSig <= aSig );
2055        if ( q ) aSig -= bSig;
2056        if ( 0 < expDiff ) {
2057            q = ( ( (bits64) aSig )<<32 ) / bSig;
2058            q >>= 32 - expDiff;
2059            bSig >>= 2;
2060            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2061        }
2062        else {
2063            aSig >>= 2;
2064            bSig >>= 2;
2065        }
2066    }
2067    else {
2068        if ( bSig <= aSig ) aSig -= bSig;
2069        aSig64 = ( (bits64) aSig )<<40;
2070        bSig64 = ( (bits64) bSig )<<40;
2071        expDiff -= 64;
2072        while ( 0 < expDiff ) {
2073            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2074            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2075            aSig64 = - ( ( bSig * q64 )<<38 );
2076            expDiff -= 62;
2077        }
2078        expDiff += 64;
2079        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2080        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2081        q = q64>>( 64 - expDiff );
2082        bSig <<= 6;
2083        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2084    }
2085    do {
2086        alternateASig = aSig;
2087        ++q;
2088        aSig -= bSig;
2089    } while ( 0 <= (sbits32) aSig );
2090    sigMean = aSig + alternateASig;
2091    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2092        aSig = alternateASig;
2093    }
2094    zSign = ( (sbits32) aSig < 0 );
2095    if ( zSign ) aSig = - aSig;
2096    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2097
2098}
2099
2100/*----------------------------------------------------------------------------
2101| Returns the square root of the single-precision floating-point value `a'.
2102| The operation is performed according to the IEC/IEEE Standard for Binary
2103| Floating-Point Arithmetic.
2104*----------------------------------------------------------------------------*/
2105
2106float32 float32_sqrt( float32 a STATUS_PARAM )
2107{
2108    flag aSign;
2109    int16 aExp, zExp;
2110    bits32 aSig, zSig;
2111    bits64 rem, term;
2112    a = float32_squash_input_denormal(a STATUS_VAR);
2113
2114    aSig = extractFloat32Frac( a );
2115    aExp = extractFloat32Exp( a );
2116    aSign = extractFloat32Sign( a );
2117    if ( aExp == 0xFF ) {
2118        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2119        if ( ! aSign ) return a;
2120        float_raise( float_flag_invalid STATUS_VAR);
2121        return float32_default_nan;
2122    }
2123    if ( aSign ) {
2124        if ( ( aExp | aSig ) == 0 ) return a;
2125        float_raise( float_flag_invalid STATUS_VAR);
2126        return float32_default_nan;
2127    }
2128    if ( aExp == 0 ) {
2129        if ( aSig == 0 ) return float32_zero;
2130        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2131    }
2132    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2133    aSig = ( aSig | 0x00800000 )<<8;
2134    zSig = estimateSqrt32( aExp, aSig ) + 2;
2135    if ( ( zSig & 0x7F ) <= 5 ) {
2136        if ( zSig < 2 ) {
2137            zSig = 0x7FFFFFFF;
2138            goto roundAndPack;
2139        }
2140        aSig >>= aExp & 1;
2141        term = ( (bits64) zSig ) * zSig;
2142        rem = ( ( (bits64) aSig )<<32 ) - term;
2143        while ( (sbits64) rem < 0 ) {
2144            --zSig;
2145            rem += ( ( (bits64) zSig )<<1 ) | 1;
2146        }
2147        zSig |= ( rem != 0 );
2148    }
2149    shift32RightJamming( zSig, 1, &zSig );
2150 roundAndPack:
2151    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2152
2153}
2154
2155/*----------------------------------------------------------------------------
2156| Returns the binary exponential of the single-precision floating-point value
2157| `a'. The operation is performed according to the IEC/IEEE Standard for
2158| Binary Floating-Point Arithmetic.
2159|
2160| Uses the following identities:
2161|
2162| 1. -------------------------------------------------------------------------
2163|      x    x*ln(2)
2164|     2  = e
2165|
2166| 2. -------------------------------------------------------------------------
2167|                      2     3     4     5           n
2168|      x        x     x     x     x     x           x
2169|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2170|               1!    2!    3!    4!    5!          n!
2171*----------------------------------------------------------------------------*/
2172
2173static const float64 float32_exp2_coefficients[15] =
2174{
2175    make_float64( 0x3ff0000000000000ll ), /*  1 */
2176    make_float64( 0x3fe0000000000000ll ), /*  2 */
2177    make_float64( 0x3fc5555555555555ll ), /*  3 */
2178    make_float64( 0x3fa5555555555555ll ), /*  4 */
2179    make_float64( 0x3f81111111111111ll ), /*  5 */
2180    make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2181    make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2182    make_float64( 0x3efa01a01a01a01all ), /*  8 */
2183    make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2184    make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2185    make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2186    make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2187    make_float64( 0x3de6124613a86d09ll ), /* 13 */
2188    make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2189    make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2190};
2191
2192float32 float32_exp2( float32 a STATUS_PARAM )
2193{
2194    flag aSign;
2195    int16 aExp;
2196    bits32 aSig;
2197    float64 r, x, xn;
2198    int i;
2199    a = float32_squash_input_denormal(a STATUS_VAR);
2200
2201    aSig = extractFloat32Frac( a );
2202    aExp = extractFloat32Exp( a );
2203    aSign = extractFloat32Sign( a );
2204
2205    if ( aExp == 0xFF) {
2206        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2207        return (aSign) ? float32_zero : a;
2208    }
2209    if (aExp == 0) {
2210        if (aSig == 0) return float32_one;
2211    }
2212
2213    float_raise( float_flag_inexact STATUS_VAR);
2214
2215    /* ******************************* */
2216    /* using float64 for approximation */
2217    /* ******************************* */
2218    x = float32_to_float64(a STATUS_VAR);
2219    x = float64_mul(x, float64_ln2 STATUS_VAR);
2220
2221    xn = x;
2222    r = float64_one;
2223    for (i = 0 ; i < 15 ; i++) {
2224        float64 f;
2225
2226        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2227        r = float64_add(r, f STATUS_VAR);
2228
2229        xn = float64_mul(xn, x STATUS_VAR);
2230    }
2231
2232    return float64_to_float32(r, status);
2233}
2234
2235/*----------------------------------------------------------------------------
2236| Returns the binary log of the single-precision floating-point value `a'.
2237| The operation is performed according to the IEC/IEEE Standard for Binary
2238| Floating-Point Arithmetic.
2239*----------------------------------------------------------------------------*/
2240float32 float32_log2( float32 a STATUS_PARAM )
2241{
2242    flag aSign, zSign;
2243    int16 aExp;
2244    bits32 aSig, zSig, i;
2245
2246    a = float32_squash_input_denormal(a STATUS_VAR);
2247    aSig = extractFloat32Frac( a );
2248    aExp = extractFloat32Exp( a );
2249    aSign = extractFloat32Sign( a );
2250
2251    if ( aExp == 0 ) {
2252        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2253        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254    }
2255    if ( aSign ) {
2256        float_raise( float_flag_invalid STATUS_VAR);
2257        return float32_default_nan;
2258    }
2259    if ( aExp == 0xFF ) {
2260        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2261        return a;
2262    }
2263
2264    aExp -= 0x7F;
2265    aSig |= 0x00800000;
2266    zSign = aExp < 0;
2267    zSig = aExp << 23;
2268
2269    for (i = 1 << 22; i > 0; i >>= 1) {
2270        aSig = ( (bits64)aSig * aSig ) >> 23;
2271        if ( aSig & 0x01000000 ) {
2272            aSig >>= 1;
2273            zSig |= i;
2274        }
2275    }
2276
2277    if ( zSign )
2278        zSig = -zSig;
2279
2280    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2281}
2282
2283/*----------------------------------------------------------------------------
2284| Returns 1 if the single-precision floating-point value `a' is equal to
2285| the corresponding value `b', and 0 otherwise.  The comparison is performed
2286| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287*----------------------------------------------------------------------------*/
2288
2289int float32_eq( float32 a, float32 b STATUS_PARAM )
2290{
2291    a = float32_squash_input_denormal(a STATUS_VAR);
2292    b = float32_squash_input_denormal(b STATUS_VAR);
2293
2294    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2296       ) {
2297        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2298            float_raise( float_flag_invalid STATUS_VAR);
2299        }
2300        return 0;
2301    }
2302    return ( float32_val(a) == float32_val(b) ) ||
2303            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2304
2305}
2306
2307/*----------------------------------------------------------------------------
2308| Returns 1 if the single-precision floating-point value `a' is less than
2309| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2310| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2311| Arithmetic.
2312*----------------------------------------------------------------------------*/
2313
2314int float32_le( float32 a, float32 b STATUS_PARAM )
2315{
2316    flag aSign, bSign;
2317    bits32 av, bv;
2318    a = float32_squash_input_denormal(a STATUS_VAR);
2319    b = float32_squash_input_denormal(b STATUS_VAR);
2320
2321    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323       ) {
2324        float_raise( float_flag_invalid STATUS_VAR);
2325        return 0;
2326    }
2327    aSign = extractFloat32Sign( a );
2328    bSign = extractFloat32Sign( b );
2329    av = float32_val(a);
2330    bv = float32_val(b);
2331    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2332    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2333
2334}
2335
2336/*----------------------------------------------------------------------------
2337| Returns 1 if the single-precision floating-point value `a' is less than
2338| the corresponding value `b', and 0 otherwise.  The comparison is performed
2339| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340*----------------------------------------------------------------------------*/
2341
2342int float32_lt( float32 a, float32 b STATUS_PARAM )
2343{
2344    flag aSign, bSign;
2345    bits32 av, bv;
2346    a = float32_squash_input_denormal(a STATUS_VAR);
2347    b = float32_squash_input_denormal(b STATUS_VAR);
2348
2349    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2350         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2351       ) {
2352        float_raise( float_flag_invalid STATUS_VAR);
2353        return 0;
2354    }
2355    aSign = extractFloat32Sign( a );
2356    bSign = extractFloat32Sign( b );
2357    av = float32_val(a);
2358    bv = float32_val(b);
2359    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2360    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2361
2362}
2363
2364/*----------------------------------------------------------------------------
2365| Returns 1 if the single-precision floating-point value `a' is equal to
2366| the corresponding value `b', and 0 otherwise.  The invalid exception is
2367| raised if either operand is a NaN.  Otherwise, the comparison is performed
2368| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2369*----------------------------------------------------------------------------*/
2370
2371int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2372{
2373    bits32 av, bv;
2374    a = float32_squash_input_denormal(a STATUS_VAR);
2375    b = float32_squash_input_denormal(b STATUS_VAR);
2376
2377    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2378         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2379       ) {
2380        float_raise( float_flag_invalid STATUS_VAR);
2381        return 0;
2382    }
2383    av = float32_val(a);
2384    bv = float32_val(b);
2385    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2386
2387}
2388
2389/*----------------------------------------------------------------------------
2390| Returns 1 if the single-precision floating-point value `a' is less than or
2391| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2392| cause an exception.  Otherwise, the comparison is performed according to the
2393| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2394*----------------------------------------------------------------------------*/
2395
2396int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2397{
2398    flag aSign, bSign;
2399    bits32 av, bv;
2400    a = float32_squash_input_denormal(a STATUS_VAR);
2401    b = float32_squash_input_denormal(b STATUS_VAR);
2402
2403    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2404         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2405       ) {
2406        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2407            float_raise( float_flag_invalid STATUS_VAR);
2408        }
2409        return 0;
2410    }
2411    aSign = extractFloat32Sign( a );
2412    bSign = extractFloat32Sign( b );
2413    av = float32_val(a);
2414    bv = float32_val(b);
2415    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2416    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2417
2418}
2419
2420/*----------------------------------------------------------------------------
2421| Returns 1 if the single-precision floating-point value `a' is less than
2422| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2423| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2424| Standard for Binary Floating-Point Arithmetic.
2425*----------------------------------------------------------------------------*/
2426
2427int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2428{
2429    flag aSign, bSign;
2430    bits32 av, bv;
2431    a = float32_squash_input_denormal(a STATUS_VAR);
2432    b = float32_squash_input_denormal(b STATUS_VAR);
2433
2434    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2435         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2436       ) {
2437        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2438            float_raise( float_flag_invalid STATUS_VAR);
2439        }
2440        return 0;
2441    }
2442    aSign = extractFloat32Sign( a );
2443    bSign = extractFloat32Sign( b );
2444    av = float32_val(a);
2445    bv = float32_val(b);
2446    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2447    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2448
2449}
2450
2451/*----------------------------------------------------------------------------
2452| Returns the result of converting the double-precision floating-point value
2453| `a' to the 32-bit two's complement integer format.  The conversion is
2454| performed according to the IEC/IEEE Standard for Binary Floating-Point
2455| Arithmetic---which means in particular that the conversion is rounded
2456| according to the current rounding mode.  If `a' is a NaN, the largest
2457| positive integer is returned.  Otherwise, if the conversion overflows, the
2458| largest integer with the same sign as `a' is returned.
2459*----------------------------------------------------------------------------*/
2460
2461int32 float64_to_int32( float64 a STATUS_PARAM )
2462{
2463    flag aSign;
2464    int16 aExp, shiftCount;
2465    bits64 aSig;
2466    a = float64_squash_input_denormal(a STATUS_VAR);
2467
2468    aSig = extractFloat64Frac( a );
2469    aExp = extractFloat64Exp( a );
2470    aSign = extractFloat64Sign( a );
2471    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2472    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2473    shiftCount = 0x42C - aExp;
2474    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2475    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2476
2477}
2478
2479/*----------------------------------------------------------------------------
2480| Returns the result of converting the double-precision floating-point value
2481| `a' to the 32-bit two's complement integer format.  The conversion is
2482| performed according to the IEC/IEEE Standard for Binary Floating-Point
2483| Arithmetic, except that the conversion is always rounded toward zero.
2484| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2485| the conversion overflows, the largest integer with the same sign as `a' is
2486| returned.
2487*----------------------------------------------------------------------------*/
2488
2489int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2490{
2491    flag aSign;
2492    int16 aExp, shiftCount;
2493    bits64 aSig, savedASig;
2494    int32 z;
2495    a = float64_squash_input_denormal(a STATUS_VAR);
2496
2497    aSig = extractFloat64Frac( a );
2498    aExp = extractFloat64Exp( a );
2499    aSign = extractFloat64Sign( a );
2500    if ( 0x41E < aExp ) {
2501        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2502        goto invalid;
2503    }
2504    else if ( aExp < 0x3FF ) {
2505        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2506        return 0;
2507    }
2508    aSig |= LIT64( 0x0010000000000000 );
2509    shiftCount = 0x433 - aExp;
2510    savedASig = aSig;
2511    aSig >>= shiftCount;
2512    z = aSig;
2513    if ( aSign ) z = - z;
2514    if ( ( z < 0 ) ^ aSign ) {
2515 invalid:
2516        float_raise( float_flag_invalid STATUS_VAR);
2517        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2518    }
2519    if ( ( aSig<<shiftCount ) != savedASig ) {
2520        STATUS(float_exception_flags) |= float_flag_inexact;
2521    }
2522    return z;
2523
2524}
2525
2526/*----------------------------------------------------------------------------
2527| Returns the result of converting the double-precision floating-point value
2528| `a' to the 16-bit two's complement integer format.  The conversion is
2529| performed according to the IEC/IEEE Standard for Binary Floating-Point
2530| Arithmetic, except that the conversion is always rounded toward zero.
2531| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2532| the conversion overflows, the largest integer with the same sign as `a' is
2533| returned.
2534*----------------------------------------------------------------------------*/
2535
2536int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2537{
2538    flag aSign;
2539    int16 aExp, shiftCount;
2540    bits64 aSig, savedASig;
2541    int32 z;
2542
2543    aSig = extractFloat64Frac( a );
2544    aExp = extractFloat64Exp( a );
2545    aSign = extractFloat64Sign( a );
2546    if ( 0x40E < aExp ) {
2547        if ( ( aExp == 0x7FF ) && aSig ) {
2548            aSign = 0;
2549        }
2550        goto invalid;
2551    }
2552    else if ( aExp < 0x3FF ) {
2553        if ( aExp || aSig ) {
2554            STATUS(float_exception_flags) |= float_flag_inexact;
2555        }
2556        return 0;
2557    }
2558    aSig |= LIT64( 0x0010000000000000 );
2559    shiftCount = 0x433 - aExp;
2560    savedASig = aSig;
2561    aSig >>= shiftCount;
2562    z = aSig;
2563    if ( aSign ) {
2564        z = - z;
2565    }
2566    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2567 invalid:
2568        float_raise( float_flag_invalid STATUS_VAR);
2569        return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2570    }
2571    if ( ( aSig<<shiftCount ) != savedASig ) {
2572        STATUS(float_exception_flags) |= float_flag_inexact;
2573    }
2574    return z;
2575}
2576
2577/*----------------------------------------------------------------------------
2578| Returns the result of converting the double-precision floating-point value
2579| `a' to the 64-bit two's complement integer format.  The conversion is
2580| performed according to the IEC/IEEE Standard for Binary Floating-Point
2581| Arithmetic---which means in particular that the conversion is rounded
2582| according to the current rounding mode.  If `a' is a NaN, the largest
2583| positive integer is returned.  Otherwise, if the conversion overflows, the
2584| largest integer with the same sign as `a' is returned.
2585*----------------------------------------------------------------------------*/
2586
2587int64 float64_to_int64( float64 a STATUS_PARAM )
2588{
2589    flag aSign;
2590    int16 aExp, shiftCount;
2591    bits64 aSig, aSigExtra;
2592    a = float64_squash_input_denormal(a STATUS_VAR);
2593
2594    aSig = extractFloat64Frac( a );
2595    aExp = extractFloat64Exp( a );
2596    aSign = extractFloat64Sign( a );
2597    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2598    shiftCount = 0x433 - aExp;
2599    if ( shiftCount <= 0 ) {
2600        if ( 0x43E < aExp ) {
2601            float_raise( float_flag_invalid STATUS_VAR);
2602            if (    ! aSign
2603                 || (    ( aExp == 0x7FF )
2604                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2605               ) {
2606                return LIT64( 0x7FFFFFFFFFFFFFFF );
2607            }
2608            return (sbits64) LIT64( 0x8000000000000000 );
2609        }
2610        aSigExtra = 0;
2611        aSig <<= - shiftCount;
2612    }
2613    else {
2614        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2615    }
2616    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2617
2618}
2619
2620/*----------------------------------------------------------------------------
2621| Returns the result of converting the double-precision floating-point value
2622| `a' to the 64-bit two's complement integer format.  The conversion is
2623| performed according to the IEC/IEEE Standard for Binary Floating-Point
2624| Arithmetic, except that the conversion is always rounded toward zero.
2625| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2626| the conversion overflows, the largest integer with the same sign as `a' is
2627| returned.
2628*----------------------------------------------------------------------------*/
2629
2630int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2631{
2632    flag aSign;
2633    int16 aExp, shiftCount;
2634    bits64 aSig;
2635    int64 z;
2636    a = float64_squash_input_denormal(a STATUS_VAR);
2637
2638    aSig = extractFloat64Frac( a );
2639    aExp = extractFloat64Exp( a );
2640    aSign = extractFloat64Sign( a );
2641    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2642    shiftCount = aExp - 0x433;
2643    if ( 0 <= shiftCount ) {
2644        if ( 0x43E <= aExp ) {
2645            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2646                float_raise( float_flag_invalid STATUS_VAR);
2647                if (    ! aSign
2648                     || (    ( aExp == 0x7FF )
2649                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2650                   ) {
2651                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2652                }
2653            }
2654            return (sbits64) LIT64( 0x8000000000000000 );
2655        }
2656        z = aSig<<shiftCount;
2657    }
2658    else {
2659        if ( aExp < 0x3FE ) {
2660            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2661            return 0;
2662        }
2663        z = aSig>>( - shiftCount );
2664        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2665            STATUS(float_exception_flags) |= float_flag_inexact;
2666        }
2667    }
2668    if ( aSign ) z = - z;
2669    return z;
2670
2671}
2672
2673/*----------------------------------------------------------------------------
2674| Returns the result of converting the double-precision floating-point value
2675| `a' to the single-precision floating-point format.  The conversion is
2676| performed according to the IEC/IEEE Standard for Binary Floating-Point
2677| Arithmetic.
2678*----------------------------------------------------------------------------*/
2679
2680float32 float64_to_float32( float64 a STATUS_PARAM )
2681{
2682    flag aSign;
2683    int16 aExp;
2684    bits64 aSig;
2685    bits32 zSig;
2686    a = float64_squash_input_denormal(a STATUS_VAR);
2687
2688    aSig = extractFloat64Frac( a );
2689    aExp = extractFloat64Exp( a );
2690    aSign = extractFloat64Sign( a );
2691    if ( aExp == 0x7FF ) {
2692        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2693        return packFloat32( aSign, 0xFF, 0 );
2694    }
2695    shift64RightJamming( aSig, 22, &aSig );
2696    zSig = aSig;
2697    if ( aExp || zSig ) {
2698        zSig |= 0x40000000;
2699        aExp -= 0x381;
2700    }
2701    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2702
2703}
2704
2705
2706/*----------------------------------------------------------------------------
2707| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2708| half-precision floating-point value, returning the result.  After being
2709| shifted into the proper positions, the three fields are simply added
2710| together to form the result.  This means that any integer portion of `zSig'
2711| will be added into the exponent.  Since a properly normalized significand
2712| will have an integer portion equal to 1, the `zExp' input should be 1 less
2713| than the desired result exponent whenever `zSig' is a complete, normalized
2714| significand.
2715*----------------------------------------------------------------------------*/
2716static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2717{
2718    return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2719}
2720
2721/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2722   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2723  
2724float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2725{
2726    flag aSign;
2727    int16 aExp;
2728    bits32 aSig;
2729
2730    aSign = a >> 15;
2731    aExp = (a >> 10) & 0x1f;
2732    aSig = a & 0x3ff;
2733
2734    if (aExp == 0x1f && ieee) {
2735        if (aSig) {
2736            /* Make sure correct exceptions are raised.  */
2737            float32ToCommonNaN(a STATUS_VAR);
2738            aSig |= 0x200;
2739        }
2740        return packFloat32(aSign, 0xff, aSig << 13);
2741    }
2742    if (aExp == 0) {
2743        int8 shiftCount;
2744
2745        if (aSig == 0) {
2746            return packFloat32(aSign, 0, 0);
2747        }
2748
2749        shiftCount = countLeadingZeros32( aSig ) - 21;
2750        aSig = aSig << shiftCount;
2751        aExp = -shiftCount;
2752    }
2753    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2754}
2755
2756bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2757{
2758    flag aSign;
2759    int16 aExp;
2760    bits32 aSig;
2761    bits32 mask;
2762    bits32 increment;
2763    int8 roundingMode;
2764    a = float32_squash_input_denormal(a STATUS_VAR);
2765
2766    aSig = extractFloat32Frac( a );
2767    aExp = extractFloat32Exp( a );
2768    aSign = extractFloat32Sign( a );
2769    if ( aExp == 0xFF ) {
2770        if (aSig) {
2771            /* Make sure correct exceptions are raised.  */
2772            float32ToCommonNaN(a STATUS_VAR);
2773            aSig |= 0x00400000;
2774        }
2775        return packFloat16(aSign, 0x1f, aSig >> 13);
2776    }
2777    if (aExp == 0 && aSign == 0) {
2778        return packFloat16(aSign, 0, 0);
2779    }
2780    /* Decimal point between bits 22 and 23.  */
2781    aSig |= 0x00800000;
2782    aExp -= 0x7f;
2783    if (aExp < -14) {
2784        mask = 0x007fffff;
2785        if (aExp < -24) {
2786            aExp = -25;
2787        } else {
2788            mask >>= 24 + aExp;
2789        }
2790    } else {
2791        mask = 0x00001fff;
2792    }
2793    if (aSig & mask) {
2794        float_raise( float_flag_underflow STATUS_VAR );
2795        roundingMode = STATUS(float_rounding_mode);
2796        switch (roundingMode) {
2797        case float_round_nearest_even:
2798            increment = (mask + 1) >> 1;
2799            if ((aSig & mask) == increment) {
2800                increment = aSig & (increment << 1);
2801            }
2802            break;
2803        case float_round_up:
2804            increment = aSign ? 0 : mask;
2805            break;
2806        case float_round_down:
2807            increment = aSign ? mask : 0;
2808            break;
2809        default: /* round_to_zero */
2810            increment = 0;
2811            break;
2812        }
2813        aSig += increment;
2814        if (aSig >= 0x01000000) {
2815            aSig >>= 1;
2816            aExp++;
2817        }
2818    } else if (aExp < -14
2819          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2820        float_raise( float_flag_underflow STATUS_VAR);
2821    }
2822
2823    if (ieee) {
2824        if (aExp > 15) {
2825            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2826            return packFloat16(aSign, 0x1f, 0);
2827        }
2828    } else {
2829        if (aExp > 16) {
2830            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2831            return packFloat16(aSign, 0x1f, 0x3ff);
2832        }
2833    }
2834    if (aExp < -24) {
2835        return packFloat16(aSign, 0, 0);
2836    }
2837    if (aExp < -14) {
2838        aSig >>= -14 - aExp;
2839        aExp = -14;
2840    }
2841    return packFloat16(aSign, aExp + 14, aSig >> 13);
2842}
2843
2844#ifdef FLOATX80
2845
2846/*----------------------------------------------------------------------------
2847| Returns the result of converting the double-precision floating-point value
2848| `a' to the extended double-precision floating-point format.  The conversion
2849| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2850| Arithmetic.
2851*----------------------------------------------------------------------------*/
2852
2853floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2854{
2855    flag aSign;
2856    int16 aExp;
2857    bits64 aSig;
2858
2859    a = float64_squash_input_denormal(a STATUS_VAR);
2860    aSig = extractFloat64Frac( a );
2861    aExp = extractFloat64Exp( a );
2862    aSign = extractFloat64Sign( a );
2863    if ( aExp == 0x7FF ) {
2864        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2865        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2866    }
2867    if ( aExp == 0 ) {
2868        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2869        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2870    }
2871    return
2872        packFloatx80(
2873            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2874
2875}
2876
2877#endif
2878
2879#ifdef FLOAT128
2880
2881/*----------------------------------------------------------------------------
2882| Returns the result of converting the double-precision floating-point value
2883| `a' to the quadruple-precision floating-point format.  The conversion is
2884| performed according to the IEC/IEEE Standard for Binary Floating-Point
2885| Arithmetic.
2886*----------------------------------------------------------------------------*/
2887
2888float128 float64_to_float128( float64 a STATUS_PARAM )
2889{
2890    flag aSign;
2891    int16 aExp;
2892    bits64 aSig, zSig0, zSig1;
2893
2894    a = float64_squash_input_denormal(a STATUS_VAR);
2895    aSig = extractFloat64Frac( a );
2896    aExp = extractFloat64Exp( a );
2897    aSign = extractFloat64Sign( a );
2898    if ( aExp == 0x7FF ) {
2899        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2900        return packFloat128( aSign, 0x7FFF, 0, 0 );
2901    }
2902    if ( aExp == 0 ) {
2903        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2904        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2905        --aExp;
2906    }
2907    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2908    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2909
2910}
2911
2912#endif
2913
2914/*----------------------------------------------------------------------------
2915| Rounds the double-precision floating-point value `a' to an integer, and
2916| returns the result as a double-precision floating-point value.  The
2917| operation is performed according to the IEC/IEEE Standard for Binary
2918| Floating-Point Arithmetic.
2919*----------------------------------------------------------------------------*/
2920
2921float64 float64_round_to_int( float64 a STATUS_PARAM )
2922{
2923    flag aSign;
2924    int16 aExp;
2925    bits64 lastBitMask, roundBitsMask;
2926    int8 roundingMode;
2927    bits64 z;
2928    a = float64_squash_input_denormal(a STATUS_VAR);
2929
2930    aExp = extractFloat64Exp( a );
2931    if ( 0x433 <= aExp ) {
2932        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2933            return propagateFloat64NaN( a, a STATUS_VAR );
2934        }
2935        return a;
2936    }
2937    if ( aExp < 0x3FF ) {
2938        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2939        STATUS(float_exception_flags) |= float_flag_inexact;
2940        aSign = extractFloat64Sign( a );
2941        switch ( STATUS(float_rounding_mode) ) {
2942         case float_round_nearest_even:
2943            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2944                return packFloat64( aSign, 0x3FF, 0 );
2945            }
2946            break;
2947         case float_round_down:
2948            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2949         case float_round_up:
2950            return make_float64(
2951            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2952        }
2953        return packFloat64( aSign, 0, 0 );
2954    }
2955    lastBitMask = 1;
2956    lastBitMask <<= 0x433 - aExp;
2957    roundBitsMask = lastBitMask - 1;
2958    z = float64_val(a);
2959    roundingMode = STATUS(float_rounding_mode);
2960    if ( roundingMode == float_round_nearest_even ) {
2961        z += lastBitMask>>1;
2962        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2963    }
2964    else if ( roundingMode != float_round_to_zero ) {
2965        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2966            z += roundBitsMask;
2967        }
2968    }
2969    z &= ~ roundBitsMask;
2970    if ( z != float64_val(a) )
2971        STATUS(float_exception_flags) |= float_flag_inexact;
2972    return make_float64(z);
2973
2974}
2975
2976float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2977{
2978    int oldmode;
2979    float64 res;
2980    oldmode = STATUS(float_rounding_mode);
2981    STATUS(float_rounding_mode) = float_round_to_zero;
2982    res = float64_round_to_int(a STATUS_VAR);
2983    STATUS(float_rounding_mode) = oldmode;
2984    return res;
2985}
2986
2987/*----------------------------------------------------------------------------
2988| Returns the result of adding the absolute values of the double-precision
2989| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2990| before being returned.  `zSign' is ignored if the result is a NaN.
2991| The addition is performed according to the IEC/IEEE Standard for Binary
2992| Floating-Point Arithmetic.
2993*----------------------------------------------------------------------------*/
2994
2995static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2996{
2997    int16 aExp, bExp, zExp;
2998    bits64 aSig, bSig, zSig;
2999    int16 expDiff;
3000
3001    aSig = extractFloat64Frac( a );
3002    aExp = extractFloat64Exp( a );
3003    bSig = extractFloat64Frac( b );
3004    bExp = extractFloat64Exp( b );
3005    expDiff = aExp - bExp;
3006    aSig <<= 9;
3007    bSig <<= 9;
3008    if ( 0 < expDiff ) {
3009        if ( aExp == 0x7FF ) {
3010            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3011            return a;
3012        }
3013        if ( bExp == 0 ) {
3014            --expDiff;
3015        }
3016        else {
3017            bSig |= LIT64( 0x2000000000000000 );
3018        }
3019        shift64RightJamming( bSig, expDiff, &bSig );
3020        zExp = aExp;
3021    }
3022    else if ( expDiff < 0 ) {
3023        if ( bExp == 0x7FF ) {
3024            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3025            return packFloat64( zSign, 0x7FF, 0 );
3026        }
3027        if ( aExp == 0 ) {
3028            ++expDiff;
3029        }
3030        else {
3031            aSig |= LIT64( 0x2000000000000000 );
3032        }
3033        shift64RightJamming( aSig, - expDiff, &aSig );
3034        zExp = bExp;
3035    }
3036    else {
3037        if ( aExp == 0x7FF ) {
3038            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3039            return a;
3040        }
3041        if ( aExp == 0 ) {
3042            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3043            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3044        }
3045        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3046        zExp = aExp;
3047        goto roundAndPack;
3048    }
3049    aSig |= LIT64( 0x2000000000000000 );
3050    zSig = ( aSig + bSig )<<1;
3051    --zExp;
3052    if ( (sbits64) zSig < 0 ) {
3053        zSig = aSig + bSig;
3054        ++zExp;
3055    }
3056 roundAndPack:
3057    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3058
3059}
3060
3061/*----------------------------------------------------------------------------
3062| Returns the result of subtracting the absolute values of the double-
3063| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3064| difference is negated before being returned.  `zSign' is ignored if the
3065| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3066| Standard for Binary Floating-Point Arithmetic.
3067*----------------------------------------------------------------------------*/
3068
3069static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3070{
3071    int16 aExp, bExp, zExp;
3072    bits64 aSig, bSig, zSig;
3073    int16 expDiff;
3074
3075    aSig = extractFloat64Frac( a );
3076    aExp = extractFloat64Exp( a );
3077    bSig = extractFloat64Frac( b );
3078    bExp = extractFloat64Exp( b );
3079    expDiff = aExp - bExp;
3080    aSig <<= 10;
3081    bSig <<= 10;
3082    if ( 0 < expDiff ) goto aExpBigger;
3083    if ( expDiff < 0 ) goto bExpBigger;
3084    if ( aExp == 0x7FF ) {
3085        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086        float_raise( float_flag_invalid STATUS_VAR);
3087        return float64_default_nan;
3088    }
3089    if ( aExp == 0 ) {
3090        aExp = 1;
3091        bExp = 1;
3092    }
3093    if ( bSig < aSig ) goto aBigger;
3094    if ( aSig < bSig ) goto bBigger;
3095    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3096 bExpBigger:
3097    if ( bExp == 0x7FF ) {
3098        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3099        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3100    }
3101    if ( aExp == 0 ) {
3102        ++expDiff;
3103    }
3104    else {
3105        aSig |= LIT64( 0x4000000000000000 );
3106    }
3107    shift64RightJamming( aSig, - expDiff, &aSig );
3108    bSig |= LIT64( 0x4000000000000000 );
3109 bBigger:
3110    zSig = bSig - aSig;
3111    zExp = bExp;
3112    zSign ^= 1;
3113    goto normalizeRoundAndPack;
3114 aExpBigger:
3115    if ( aExp == 0x7FF ) {
3116        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3117        return a;
3118    }
3119    if ( bExp == 0 ) {
3120        --expDiff;
3121    }
3122    else {
3123        bSig |= LIT64( 0x4000000000000000 );
3124    }
3125    shift64RightJamming( bSig, expDiff, &bSig );
3126    aSig |= LIT64( 0x4000000000000000 );
3127 aBigger:
3128    zSig = aSig - bSig;
3129    zExp = aExp;
3130 normalizeRoundAndPack:
3131    --zExp;
3132    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3133
3134}
3135
3136/*----------------------------------------------------------------------------
3137| Returns the result of adding the double-precision floating-point values `a'
3138| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3139| Binary Floating-Point Arithmetic.
3140*----------------------------------------------------------------------------*/
3141
3142float64 float64_add( float64 a, float64 b STATUS_PARAM )
3143{
3144    flag aSign, bSign;
3145    a = float64_squash_input_denormal(a STATUS_VAR);
3146    b = float64_squash_input_denormal(b STATUS_VAR);
3147
3148    aSign = extractFloat64Sign( a );
3149    bSign = extractFloat64Sign( b );
3150    if ( aSign == bSign ) {
3151        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3152    }
3153    else {
3154        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3155    }
3156
3157}
3158
3159/*----------------------------------------------------------------------------
3160| Returns the result of subtracting the double-precision floating-point values
3161| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3162| for Binary Floating-Point Arithmetic.
3163*----------------------------------------------------------------------------*/
3164
3165float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3166{
3167    flag aSign, bSign;
3168    a = float64_squash_input_denormal(a STATUS_VAR);
3169    b = float64_squash_input_denormal(b STATUS_VAR);
3170
3171    aSign = extractFloat64Sign( a );
3172    bSign = extractFloat64Sign( b );
3173    if ( aSign == bSign ) {
3174        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3175    }
3176    else {
3177        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3178    }
3179
3180}
3181
3182/*----------------------------------------------------------------------------
3183| Returns the result of multiplying the double-precision floating-point values
3184| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3185| for Binary Floating-Point Arithmetic.
3186*----------------------------------------------------------------------------*/
3187
3188float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3189{
3190    flag aSign, bSign, zSign;
3191    int16 aExp, bExp, zExp;
3192    bits64 aSig, bSig, zSig0, zSig1;
3193
3194    a = float64_squash_input_denormal(a STATUS_VAR);
3195    b = float64_squash_input_denormal(b STATUS_VAR);
3196
3197    aSig = extractFloat64Frac( a );
3198    aExp = extractFloat64Exp( a );
3199    aSign = extractFloat64Sign( a );
3200    bSig = extractFloat64Frac( b );
3201    bExp = extractFloat64Exp( b );
3202    bSign = extractFloat64Sign( b );
3203    zSign = aSign ^ bSign;
3204    if ( aExp == 0x7FF ) {
3205        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3206            return propagateFloat64NaN( a, b STATUS_VAR );
3207        }
3208        if ( ( bExp | bSig ) == 0 ) {
3209            float_raise( float_flag_invalid STATUS_VAR);
3210            return float64_default_nan;
3211        }
3212        return packFloat64( zSign, 0x7FF, 0 );
3213    }
3214    if ( bExp == 0x7FF ) {
3215        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3216        if ( ( aExp | aSig ) == 0 ) {
3217            float_raise( float_flag_invalid STATUS_VAR);
3218            return float64_default_nan;
3219        }
3220        return packFloat64( zSign, 0x7FF, 0 );
3221    }
3222    if ( aExp == 0 ) {
3223        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3224        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3225    }
3226    if ( bExp == 0 ) {
3227        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3228        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3229    }
3230    zExp = aExp + bExp - 0x3FF;
3231    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3232    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3233    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3234    zSig0 |= ( zSig1 != 0 );
3235    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3236        zSig0 <<= 1;
3237        --zExp;
3238    }
3239    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3240
3241}
3242
3243/*----------------------------------------------------------------------------
3244| Returns the result of dividing the double-precision floating-point value `a'
3245| by the corresponding value `b'.  The operation is performed according to
3246| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3247*----------------------------------------------------------------------------*/
3248
3249float64 float64_div( float64 a, float64 b STATUS_PARAM )
3250{
3251    flag aSign, bSign, zSign;
3252    int16 aExp, bExp, zExp;
3253    bits64 aSig, bSig, zSig;
3254    bits64 rem0, rem1;
3255    bits64 term0, term1;
3256    a = float64_squash_input_denormal(a STATUS_VAR);
3257    b = float64_squash_input_denormal(b STATUS_VAR);
3258
3259    aSig = extractFloat64Frac( a );
3260    aExp = extractFloat64Exp( a );
3261    aSign = extractFloat64Sign( a );
3262    bSig = extractFloat64Frac( b );
3263    bExp = extractFloat64Exp( b );
3264    bSign = extractFloat64Sign( b );
3265    zSign = aSign ^ bSign;
3266    if ( aExp == 0x7FF ) {
3267        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3268        if ( bExp == 0x7FF ) {
3269            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3270            float_raise( float_flag_invalid STATUS_VAR);
3271            return float64_default_nan;
3272        }
3273        return packFloat64( zSign, 0x7FF, 0 );
3274    }
3275    if ( bExp == 0x7FF ) {
3276        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277        return packFloat64( zSign, 0, 0 );
3278    }
3279    if ( bExp == 0 ) {
3280        if ( bSig == 0 ) {
3281            if ( ( aExp | aSig ) == 0 ) {
3282                float_raise( float_flag_invalid STATUS_VAR);
3283                return float64_default_nan;
3284            }
3285            float_raise( float_flag_divbyzero STATUS_VAR);
3286            return packFloat64( zSign, 0x7FF, 0 );
3287        }
3288        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3289    }
3290    if ( aExp == 0 ) {
3291        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3292        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3293    }
3294    zExp = aExp - bExp + 0x3FD;
3295    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3296    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3297    if ( bSig <= ( aSig + aSig ) ) {
3298        aSig >>= 1;
3299        ++zExp;
3300    }
3301    zSig = estimateDiv128To64( aSig, 0, bSig );
3302    if ( ( zSig & 0x1FF ) <= 2 ) {
3303        mul64To128( bSig, zSig, &term0, &term1 );
3304        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3305        while ( (sbits64) rem0 < 0 ) {
3306            --zSig;
3307            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3308        }
3309        zSig |= ( rem1 != 0 );
3310    }
3311    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3312
3313}
3314
3315/*----------------------------------------------------------------------------
3316| Returns the remainder of the double-precision floating-point value `a'
3317| with respect to the corresponding value `b'.  The operation is performed
3318| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319*----------------------------------------------------------------------------*/
3320
3321float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3322{
3323    flag aSign, zSign;
3324    int16 aExp, bExp, expDiff;
3325    bits64 aSig, bSig;
3326    bits64 q, alternateASig;
3327    sbits64 sigMean;
3328
3329    a = float64_squash_input_denormal(a STATUS_VAR);
3330    b = float64_squash_input_denormal(b STATUS_VAR);
3331    aSig = extractFloat64Frac( a );
3332    aExp = extractFloat64Exp( a );
3333    aSign = extractFloat64Sign( a );
3334    bSig = extractFloat64Frac( b );
3335    bExp = extractFloat64Exp( b );
3336    if ( aExp == 0x7FF ) {
3337        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3338            return propagateFloat64NaN( a, b STATUS_VAR );
3339        }
3340        float_raise( float_flag_invalid STATUS_VAR);
3341        return float64_default_nan;
3342    }
3343    if ( bExp == 0x7FF ) {
3344        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3345        return a;
3346    }
3347    if ( bExp == 0 ) {
3348        if ( bSig == 0 ) {
3349            float_raise( float_flag_invalid STATUS_VAR);
3350            return float64_default_nan;
3351        }
3352        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3353    }
3354    if ( aExp == 0 ) {
3355        if ( aSig == 0 ) return a;
3356        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3357    }
3358    expDiff = aExp - bExp;
3359    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3360    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3361    if ( expDiff < 0 ) {
3362        if ( expDiff < -1 ) return a;
3363        aSig >>= 1;
3364    }
3365    q = ( bSig <= aSig );
3366    if ( q ) aSig -= bSig;
3367    expDiff -= 64;
3368    while ( 0 < expDiff ) {
3369        q = estimateDiv128To64( aSig, 0, bSig );
3370        q = ( 2 < q ) ? q - 2 : 0;
3371        aSig = - ( ( bSig>>2 ) * q );
3372        expDiff -= 62;
3373    }
3374    expDiff += 64;
3375    if ( 0 < expDiff ) {
3376        q = estimateDiv128To64( aSig, 0, bSig );
3377        q = ( 2 < q ) ? q - 2 : 0;
3378        q >>= 64 - expDiff;
3379        bSig >>= 2;
3380        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3381    }
3382    else {
3383        aSig >>= 2;
3384        bSig >>= 2;
3385    }
3386    do {
3387        alternateASig = aSig;
3388        ++q;
3389        aSig -= bSig;
3390    } while ( 0 <= (sbits64) aSig );
3391    sigMean = aSig + alternateASig;
3392    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3393        aSig = alternateASig;
3394    }
3395    zSign = ( (sbits64) aSig < 0 );
3396    if ( zSign ) aSig = - aSig;
3397    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3398
3399}
3400
3401/*----------------------------------------------------------------------------
3402| Returns the square root of the double-precision floating-point value `a'.
3403| The operation is performed according to the IEC/IEEE Standard for Binary
3404| Floating-Point Arithmetic.
3405*----------------------------------------------------------------------------*/
3406
3407float64 float64_sqrt( float64 a STATUS_PARAM )
3408{
3409    flag aSign;
3410    int16 aExp, zExp;
3411    bits64 aSig, zSig, doubleZSig;
3412    bits64 rem0, rem1, term0, term1;
3413    a = float64_squash_input_denormal(a STATUS_VAR);
3414
3415    aSig = extractFloat64Frac( a );
3416    aExp = extractFloat64Exp( a );
3417    aSign = extractFloat64Sign( a );
3418    if ( aExp == 0x7FF ) {
3419        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3420        if ( ! aSign ) return a;
3421        float_raise( float_flag_invalid STATUS_VAR);
3422        return float64_default_nan;
3423    }
3424    if ( aSign ) {
3425        if ( ( aExp | aSig ) == 0 ) return a;
3426        float_raise( float_flag_invalid STATUS_VAR);
3427        return float64_default_nan;
3428    }
3429    if ( aExp == 0 ) {
3430        if ( aSig == 0 ) return float64_zero;
3431        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3432    }
3433    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3434    aSig |= LIT64( 0x0010000000000000 );
3435    zSig = estimateSqrt32( aExp, aSig>>21 );
3436    aSig <<= 9 - ( aExp & 1 );
3437    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3438    if ( ( zSig & 0x1FF ) <= 5 ) {
3439        doubleZSig = zSig<<1;
3440        mul64To128( zSig, zSig, &term0, &term1 );
3441        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3442        while ( (sbits64) rem0 < 0 ) {
3443            --zSig;
3444            doubleZSig -= 2;
3445            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3446        }
3447        zSig |= ( ( rem0 | rem1 ) != 0 );
3448    }
3449    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3450
3451}
3452
3453/*----------------------------------------------------------------------------
3454| Returns the binary log of the double-precision floating-point value `a'.
3455| The operation is performed according to the IEC/IEEE Standard for Binary
3456| Floating-Point Arithmetic.
3457*----------------------------------------------------------------------------*/
3458float64 float64_log2( float64 a STATUS_PARAM )
3459{
3460    flag aSign, zSign;
3461    int16 aExp;
3462    bits64 aSig, aSig0, aSig1, zSig, i;
3463    a = float64_squash_input_denormal(a STATUS_VAR);
3464
3465    aSig = extractFloat64Frac( a );
3466    aExp = extractFloat64Exp( a );
3467    aSign = extractFloat64Sign( a );
3468
3469    if ( aExp == 0 ) {
3470        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3471        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3472    }
3473    if ( aSign ) {
3474        float_raise( float_flag_invalid STATUS_VAR);
3475        return float64_default_nan;
3476    }
3477    if ( aExp == 0x7FF ) {
3478        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3479        return a;
3480    }
3481
3482    aExp -= 0x3FF;
3483    aSig |= LIT64( 0x0010000000000000 );
3484    zSign = aExp < 0;
3485    zSig = (bits64)aExp << 52;
3486    for (i = 1LL << 51; i > 0; i >>= 1) {
3487        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3488        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3489        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3490            aSig >>= 1;
3491            zSig |= i;
3492        }
3493    }
3494
3495    if ( zSign )
3496        zSig = -zSig;
3497    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3498}
3499
3500/*----------------------------------------------------------------------------
3501| Returns 1 if the double-precision floating-point value `a' is equal to the
3502| corresponding value `b', and 0 otherwise.  The comparison is performed
3503| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3504*----------------------------------------------------------------------------*/
3505
3506int float64_eq( float64 a, float64 b STATUS_PARAM )
3507{
3508    bits64 av, bv;
3509    a = float64_squash_input_denormal(a STATUS_VAR);
3510    b = float64_squash_input_denormal(b STATUS_VAR);
3511
3512    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3513         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3514       ) {
3515        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3516            float_raise( float_flag_invalid STATUS_VAR);
3517        }
3518        return 0;
3519    }
3520    av = float64_val(a);
3521    bv = float64_val(b);
3522    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3523
3524}
3525
3526/*----------------------------------------------------------------------------
3527| Returns 1 if the double-precision floating-point value `a' is less than or
3528| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3529| performed according to the IEC/IEEE Standard for Binary Floating-Point
3530| Arithmetic.
3531*----------------------------------------------------------------------------*/
3532
3533int float64_le( float64 a, float64 b STATUS_PARAM )
3534{
3535    flag aSign, bSign;
3536    bits64 av, bv;
3537    a = float64_squash_input_denormal(a STATUS_VAR);
3538    b = float64_squash_input_denormal(b STATUS_VAR);
3539
3540    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3541         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3542       ) {
3543        float_raise( float_flag_invalid STATUS_VAR);
3544        return 0;
3545    }
3546    aSign = extractFloat64Sign( a );
3547    bSign = extractFloat64Sign( b );
3548    av = float64_val(a);
3549    bv = float64_val(b);
3550    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3551    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3552
3553}
3554
3555/*----------------------------------------------------------------------------
3556| Returns 1 if the double-precision floating-point value `a' is less than
3557| the corresponding value `b', and 0 otherwise.  The comparison is performed
3558| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3559*----------------------------------------------------------------------------*/
3560
3561int float64_lt( float64 a, float64 b STATUS_PARAM )
3562{
3563    flag aSign, bSign;
3564    bits64 av, bv;
3565
3566    a = float64_squash_input_denormal(a STATUS_VAR);
3567    b = float64_squash_input_denormal(b STATUS_VAR);
3568    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3569         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3570       ) {
3571        float_raise( float_flag_invalid STATUS_VAR);
3572        return 0;
3573    }
3574    aSign = extractFloat64Sign( a );
3575    bSign = extractFloat64Sign( b );
3576    av = float64_val(a);
3577    bv = float64_val(b);
3578    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3579    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3580
3581}
3582
3583/*----------------------------------------------------------------------------
3584| Returns 1 if the double-precision floating-point value `a' is equal to the
3585| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3586| if either operand is a NaN.  Otherwise, the comparison is performed
3587| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3588*----------------------------------------------------------------------------*/
3589
3590int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3591{
3592    bits64 av, bv;
3593    a = float64_squash_input_denormal(a STATUS_VAR);
3594    b = float64_squash_input_denormal(b STATUS_VAR);
3595
3596    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3597         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3598       ) {
3599        float_raise( float_flag_invalid STATUS_VAR);
3600        return 0;
3601    }
3602    av = float64_val(a);
3603    bv = float64_val(b);
3604    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3605
3606}
3607
3608/*----------------------------------------------------------------------------
3609| Returns 1 if the double-precision floating-point value `a' is less than or
3610| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3611| cause an exception.  Otherwise, the comparison is performed according to the
3612| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3613*----------------------------------------------------------------------------*/
3614
3615int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3616{
3617    flag aSign, bSign;
3618    bits64 av, bv;
3619    a = float64_squash_input_denormal(a STATUS_VAR);
3620    b = float64_squash_input_denormal(b STATUS_VAR);
3621
3622    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3623         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3624       ) {
3625        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3626            float_raise( float_flag_invalid STATUS_VAR);
3627        }
3628        return 0;
3629    }
3630    aSign = extractFloat64Sign( a );
3631    bSign = extractFloat64Sign( b );
3632    av = float64_val(a);
3633    bv = float64_val(b);
3634    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3635    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3636
3637}
3638
3639/*----------------------------------------------------------------------------
3640| Returns 1 if the double-precision floating-point value `a' is less than
3641| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3642| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3643| Standard for Binary Floating-Point Arithmetic.
3644*----------------------------------------------------------------------------*/
3645
3646int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3647{
3648    flag aSign, bSign;
3649    bits64 av, bv;
3650    a = float64_squash_input_denormal(a STATUS_VAR);
3651    b = float64_squash_input_denormal(b STATUS_VAR);
3652
3653    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3654         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3655       ) {
3656        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3657            float_raise( float_flag_invalid STATUS_VAR);
3658        }
3659        return 0;
3660    }
3661    aSign = extractFloat64Sign( a );
3662    bSign = extractFloat64Sign( b );
3663    av = float64_val(a);
3664    bv = float64_val(b);
3665    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3666    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3667
3668}
3669
3670#ifdef FLOATX80
3671
3672/*----------------------------------------------------------------------------
3673| Returns the result of converting the extended double-precision floating-
3674| point value `a' to the 32-bit two's complement integer format.  The
3675| conversion is performed according to the IEC/IEEE Standard for Binary
3676| Floating-Point Arithmetic---which means in particular that the conversion
3677| is rounded according to the current rounding mode.  If `a' is a NaN, the
3678| largest positive integer is returned.  Otherwise, if the conversion
3679| overflows, the largest integer with the same sign as `a' is returned.
3680*----------------------------------------------------------------------------*/
3681
3682int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3683{
3684    flag aSign;
3685    int32 aExp, shiftCount;
3686    bits64 aSig;
3687
3688    aSig = extractFloatx80Frac( a );
3689    aExp = extractFloatx80Exp( a );
3690    aSign = extractFloatx80Sign( a );
3691    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3692    shiftCount = 0x4037 - aExp;
3693    if ( shiftCount <= 0 ) shiftCount = 1;
3694    shift64RightJamming( aSig, shiftCount, &aSig );
3695    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3696
3697}
3698
3699/*----------------------------------------------------------------------------
3700| Returns the result of converting the extended double-precision floating-
3701| point value `a' to the 32-bit two's complement integer format.  The
3702| conversion is performed according to the IEC/IEEE Standard for Binary
3703| Floating-Point Arithmetic, except that the conversion is always rounded
3704| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3705| Otherwise, if the conversion overflows, the largest integer with the same
3706| sign as `a' is returned.
3707*----------------------------------------------------------------------------*/
3708
3709int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3710{
3711    flag aSign;
3712    int32 aExp, shiftCount;
3713    bits64 aSig, savedASig;
3714    int32 z;
3715
3716    aSig = extractFloatx80Frac( a );
3717    aExp = extractFloatx80Exp( a );
3718    aSign = extractFloatx80Sign( a );
3719    if ( 0x401E < aExp ) {
3720        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3721        goto invalid;
3722    }
3723    else if ( aExp < 0x3FFF ) {
3724        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3725        return 0;
3726    }
3727    shiftCount = 0x403E - aExp;
3728    savedASig = aSig;
3729    aSig >>= shiftCount;
3730    z = aSig;
3731    if ( aSign ) z = - z;
3732    if ( ( z < 0 ) ^ aSign ) {
3733 invalid:
3734        float_raise( float_flag_invalid STATUS_VAR);
3735        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3736    }
3737    if ( ( aSig<<shiftCount ) != savedASig ) {
3738        STATUS(float_exception_flags) |= float_flag_inexact;
3739    }
3740    return z;
3741
3742}
3743
3744/*----------------------------------------------------------------------------
3745| Returns the result of converting the extended double-precision floating-
3746| point value `a' to the 64-bit two's complement integer format.  The
3747| conversion is performed according to the IEC/IEEE Standard for Binary
3748| Floating-Point Arithmetic---which means in particular that the conversion
3749| is rounded according to the current rounding mode.  If `a' is a NaN,
3750| the largest positive integer is returned.  Otherwise, if the conversion
3751| overflows, the largest integer with the same sign as `a' is returned.
3752*----------------------------------------------------------------------------*/
3753
3754int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3755{
3756    flag aSign;
3757    int32 aExp, shiftCount;
3758    bits64 aSig, aSigExtra;
3759
3760    aSig = extractFloatx80Frac( a );
3761    aExp = extractFloatx80Exp( a );
3762    aSign = extractFloatx80Sign( a );
3763    shiftCount = 0x403E - aExp;
3764    if ( shiftCount <= 0 ) {
3765        if ( shiftCount ) {
3766            float_raise( float_flag_invalid STATUS_VAR);
3767            if (    ! aSign
3768                 || (    ( aExp == 0x7FFF )
3769                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3770               ) {
3771                return LIT64( 0x7FFFFFFFFFFFFFFF );
3772            }
3773            return (sbits64) LIT64( 0x8000000000000000 );
3774        }
3775        aSigExtra = 0;
3776    }
3777    else {
3778        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3779    }
3780    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3781
3782}
3783
3784/*----------------------------------------------------------------------------
3785| Returns the result of converting the extended double-precision floating-
3786| point value `a' to the 64-bit two's complement integer format.  The
3787| conversion is performed according to the IEC/IEEE Standard for Binary
3788| Floating-Point Arithmetic, except that the conversion is always rounded
3789| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3790| Otherwise, if the conversion overflows, the largest integer with the same
3791| sign as `a' is returned.
3792*----------------------------------------------------------------------------*/
3793
3794int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3795{
3796    flag aSign;
3797    int32 aExp, shiftCount;
3798    bits64 aSig;
3799    int64 z;
3800
3801    aSig = extractFloatx80Frac( a );
3802    aExp = extractFloatx80Exp( a );
3803    aSign = extractFloatx80Sign( a );
3804    shiftCount = aExp - 0x403E;
3805    if ( 0 <= shiftCount ) {
3806        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3807        if ( ( a.high != 0xC03E ) || aSig ) {
3808            float_raise( float_flag_invalid STATUS_VAR);
3809            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3810                return LIT64( 0x7FFFFFFFFFFFFFFF );
3811            }
3812        }
3813        return (sbits64) LIT64( 0x8000000000000000 );
3814    }
3815    else if ( aExp < 0x3FFF ) {
3816        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3817        return 0;
3818    }
3819    z = aSig>>( - shiftCount );
3820    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3821        STATUS(float_exception_flags) |= float_flag_inexact;
3822    }
3823    if ( aSign ) z = - z;
3824    return z;
3825
3826}
3827
3828/*----------------------------------------------------------------------------
3829| Returns the result of converting the extended double-precision floating-
3830| point value `a' to the single-precision floating-point format.  The
3831| conversion is performed according to the IEC/IEEE Standard for Binary
3832| Floating-Point Arithmetic.
3833*----------------------------------------------------------------------------*/
3834
3835float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3836{
3837    flag aSign;
3838    int32 aExp;
3839    bits64 aSig;
3840
3841    aSig = extractFloatx80Frac( a );
3842    aExp = extractFloatx80Exp( a );
3843    aSign = extractFloatx80Sign( a );
3844    if ( aExp == 0x7FFF ) {
3845        if ( (bits64) ( aSig<<1 ) ) {
3846            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3847        }
3848        return packFloat32( aSign, 0xFF, 0 );
3849    }
3850    shift64RightJamming( aSig, 33, &aSig );
3851    if ( aExp || aSig ) aExp -= 0x3F81;
3852    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3853
3854}
3855
3856/*----------------------------------------------------------------------------
3857| Returns the result of converting the extended double-precision floating-
3858| point value `a' to the double-precision floating-point format.  The
3859| conversion is performed according to the IEC/IEEE Standard for Binary
3860| Floating-Point Arithmetic.
3861*----------------------------------------------------------------------------*/
3862
3863float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3864{
3865    flag aSign;
3866    int32 aExp;
3867    bits64 aSig, zSig;
3868
3869    aSig = extractFloatx80Frac( a );
3870    aExp = extractFloatx80Exp( a );
3871    aSign = extractFloatx80Sign( a );
3872    if ( aExp == 0x7FFF ) {
3873        if ( (bits64) ( aSig<<1 ) ) {
3874            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3875        }
3876        return packFloat64( aSign, 0x7FF, 0 );
3877    }
3878    shift64RightJamming( aSig, 1, &zSig );
3879    if ( aExp || aSig ) aExp -= 0x3C01;
3880    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3881
3882}
3883
3884#ifdef FLOAT128
3885
3886/*----------------------------------------------------------------------------
3887| Returns the result of converting the extended double-precision floating-
3888| point value `a' to the quadruple-precision floating-point format.  The
3889| conversion is performed according to the IEC/IEEE Standard for Binary
3890| Floating-Point Arithmetic.
3891*----------------------------------------------------------------------------*/
3892
3893float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3894{
3895    flag aSign;
3896    int16 aExp;
3897    bits64 aSig, zSig0, zSig1;
3898
3899    aSig = extractFloatx80Frac( a );
3900    aExp = extractFloatx80Exp( a );
3901    aSign = extractFloatx80Sign( a );
3902    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3903        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3904    }
3905    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3906    return packFloat128( aSign, aExp, zSig0, zSig1 );
3907
3908}
3909
3910#endif
3911
3912/*----------------------------------------------------------------------------
3913| Rounds the extended double-precision floating-point value `a' to an integer,
3914| and returns the result as an extended quadruple-precision floating-point
3915| value.  The operation is performed according to the IEC/IEEE Standard for
3916| Binary Floating-Point Arithmetic.
3917*----------------------------------------------------------------------------*/
3918
3919floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3920{
3921    flag aSign;
3922    int32 aExp;
3923    bits64 lastBitMask, roundBitsMask;
3924    int8 roundingMode;
3925    floatx80 z;
3926
3927    aExp = extractFloatx80Exp( a );
3928    if ( 0x403E <= aExp ) {
3929        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3930            return propagateFloatx80NaN( a, a STATUS_VAR );
3931        }
3932        return a;
3933    }
3934    if ( aExp < 0x3FFF ) {
3935        if (    ( aExp == 0 )
3936             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3937            return a;
3938        }
3939        STATUS(float_exception_flags) |= float_flag_inexact;
3940        aSign = extractFloatx80Sign( a );
3941        switch ( STATUS(float_rounding_mode) ) {
3942         case float_round_nearest_even:
3943            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3944               ) {
3945                return
3946                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3947            }
3948            break;
3949         case float_round_down:
3950            return
3951                  aSign ?
3952                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3953                : packFloatx80( 0, 0, 0 );
3954         case float_round_up:
3955            return
3956                  aSign ? packFloatx80( 1, 0, 0 )
3957                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3958        }
3959        return packFloatx80( aSign, 0, 0 );
3960    }
3961    lastBitMask = 1;
3962    lastBitMask <<= 0x403E - aExp;
3963    roundBitsMask = lastBitMask - 1;
3964    z = a;
3965    roundingMode = STATUS(float_rounding_mode);
3966    if ( roundingMode == float_round_nearest_even ) {
3967        z.low += lastBitMask>>1;
3968        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3969    }
3970    else if ( roundingMode != float_round_to_zero ) {
3971        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3972            z.low += roundBitsMask;
3973        }
3974    }
3975    z.low &= ~ roundBitsMask;
3976    if ( z.low == 0 ) {
3977        ++z.high;
3978        z.low = LIT64( 0x8000000000000000 );
3979    }
3980    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3981    return z;
3982
3983}
3984
3985/*----------------------------------------------------------------------------
3986| Returns the result of adding the absolute values of the extended double-
3987| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3988| negated before being returned.  `zSign' is ignored if the result is a NaN.
3989| The addition is performed according to the IEC/IEEE Standard for Binary
3990| Floating-Point Arithmetic.
3991*----------------------------------------------------------------------------*/
3992
3993static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3994{
3995    int32 aExp, bExp, zExp;
3996    bits64 aSig, bSig, zSig0, zSig1;
3997    int32 expDiff;
3998
3999    aSig = extractFloatx80Frac( a );
4000    aExp = extractFloatx80Exp( a );
4001    bSig = extractFloatx80Frac( b );
4002    bExp = extractFloatx80Exp( b );
4003    expDiff = aExp - bExp;
4004    if ( 0 < expDiff ) {
4005        if ( aExp == 0x7FFF ) {
4006            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4007            return a;
4008        }
4009        if ( bExp == 0 ) --expDiff;
4010        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4011        zExp = aExp;
4012    }
4013    else if ( expDiff < 0 ) {
4014        if ( bExp == 0x7FFF ) {
4015            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4016            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4017        }
4018        if ( aExp == 0 ) ++expDiff;
4019        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4020        zExp = bExp;
4021    }
4022    else {
4023        if ( aExp == 0x7FFF ) {
4024            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4025                return propagateFloatx80NaN( a, b STATUS_VAR );
4026            }
4027            return a;
4028        }
4029        zSig1 = 0;
4030        zSig0 = aSig + bSig;
4031        if ( aExp == 0 ) {
4032            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4033            goto roundAndPack;
4034        }
4035        zExp = aExp;
4036        goto shiftRight1;
4037    }
4038    zSig0 = aSig + bSig;
4039    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4040 shiftRight1:
4041    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4042    zSig0 |= LIT64( 0x8000000000000000 );
4043    ++zExp;
4044 roundAndPack:
4045    return
4046        roundAndPackFloatx80(
4047            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4048
4049}
4050
4051/*----------------------------------------------------------------------------
4052| Returns the result of subtracting the absolute values of the extended
4053| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4054| difference is negated before being returned.  `zSign' is ignored if the
4055| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4056| Standard for Binary Floating-Point Arithmetic.
4057*----------------------------------------------------------------------------*/
4058
4059static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4060{
4061    int32 aExp, bExp, zExp;
4062    bits64 aSig, bSig, zSig0, zSig1;
4063    int32 expDiff;
4064    floatx80 z;
4065
4066    aSig = extractFloatx80Frac( a );
4067    aExp = extractFloatx80Exp( a );
4068    bSig = extractFloatx80Frac( b );
4069    bExp = extractFloatx80Exp( b );
4070    expDiff = aExp - bExp;
4071    if ( 0 < expDiff ) goto aExpBigger;
4072    if ( expDiff < 0 ) goto bExpBigger;
4073    if ( aExp == 0x7FFF ) {
4074        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4075            return propagateFloatx80NaN( a, b STATUS_VAR );
4076        }
4077        float_raise( float_flag_invalid STATUS_VAR);
4078        z.low = floatx80_default_nan_low;
4079        z.high = floatx80_default_nan_high;
4080        return z;
4081    }
4082    if ( aExp == 0 ) {
4083        aExp = 1;
4084        bExp = 1;
4085    }
4086    zSig1 = 0;
4087    if ( bSig < aSig ) goto aBigger;
4088    if ( aSig < bSig ) goto bBigger;
4089    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4090 bExpBigger:
4091    if ( bExp == 0x7FFF ) {
4092        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4093        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4094    }
4095    if ( aExp == 0 ) ++expDiff;
4096    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4097 bBigger:
4098    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4099    zExp = bExp;
4100    zSign ^= 1;
4101    goto normalizeRoundAndPack;
4102 aExpBigger:
4103    if ( aExp == 0x7FFF ) {
4104        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4105        return a;
4106    }
4107    if ( bExp == 0 ) --expDiff;
4108    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4109 aBigger:
4110    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4111    zExp = aExp;
4112 normalizeRoundAndPack:
4113    return
4114        normalizeRoundAndPackFloatx80(
4115            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4116
4117}
4118
4119/*----------------------------------------------------------------------------
4120| Returns the result of adding the extended double-precision floating-point
4121| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4122| Standard for Binary Floating-Point Arithmetic.
4123*----------------------------------------------------------------------------*/
4124
4125floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4126{
4127    flag aSign, bSign;
4128
4129    aSign = extractFloatx80Sign( a );
4130    bSign = extractFloatx80Sign( b );
4131    if ( aSign == bSign ) {
4132        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4133    }
4134    else {
4135        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4136    }
4137
4138}
4139
4140/*----------------------------------------------------------------------------
4141| Returns the result of subtracting the extended double-precision floating-
4142| point values `a' and `b'.  The operation is performed according to the
4143| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4144*----------------------------------------------------------------------------*/
4145
4146floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4147{
4148    flag aSign, bSign;
4149
4150    aSign = extractFloatx80Sign( a );
4151    bSign = extractFloatx80Sign( b );
4152    if ( aSign == bSign ) {
4153        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4154    }
4155    else {
4156        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4157    }
4158
4159}
4160
4161/*----------------------------------------------------------------------------
4162| Returns the result of multiplying the extended double-precision floating-
4163| point values `a' and `b'.  The operation is performed according to the
4164| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4165*----------------------------------------------------------------------------*/
4166
4167floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4168{
4169    flag aSign, bSign, zSign;
4170    int32 aExp, bExp, zExp;
4171    bits64 aSig, bSig, zSig0, zSig1;
4172    floatx80 z;
4173
4174    aSig = extractFloatx80Frac( a );
4175    aExp = extractFloatx80Exp( a );
4176    aSign = extractFloatx80Sign( a );
4177    bSig = extractFloatx80Frac( b );
4178    bExp = extractFloatx80Exp( b );
4179    bSign = extractFloatx80Sign( b );
4180    zSign = aSign ^ bSign;
4181    if ( aExp == 0x7FFF ) {
4182        if (    (bits64) ( aSig<<1 )
4183             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4184            return propagateFloatx80NaN( a, b STATUS_VAR );
4185        }
4186        if ( ( bExp | bSig ) == 0 ) goto invalid;
4187        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4188    }
4189    if ( bExp == 0x7FFF ) {
4190        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4191        if ( ( aExp | aSig ) == 0 ) {
4192 invalid:
4193            float_raise( float_flag_invalid STATUS_VAR);
4194            z.low = floatx80_default_nan_low;
4195            z.high = floatx80_default_nan_high;
4196            return z;
4197        }
4198        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4199    }
4200    if ( aExp == 0 ) {
4201        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4202        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4203    }
4204    if ( bExp == 0 ) {
4205        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4206        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4207    }
4208    zExp = aExp + bExp - 0x3FFE;
4209    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4210    if ( 0 < (sbits64) zSig0 ) {
4211        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4212        --zExp;
4213    }
4214    return
4215        roundAndPackFloatx80(
4216            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4217
4218}
4219
4220/*----------------------------------------------------------------------------
4221| Returns the result of dividing the extended double-precision floating-point
4222| value `a' by the corresponding value `b'.  The operation is performed
4223| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4224*----------------------------------------------------------------------------*/
4225
4226floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4227{
4228    flag aSign, bSign, zSign;
4229    int32 aExp, bExp, zExp;
4230    bits64 aSig, bSig, zSig0, zSig1;
4231    bits64 rem0, rem1, rem2, term0, term1, term2;
4232    floatx80 z;
4233
4234    aSig = extractFloatx80Frac( a );
4235    aExp = extractFloatx80Exp( a );
4236    aSign = extractFloatx80Sign( a );
4237    bSig = extractFloatx80Frac( b );
4238    bExp = extractFloatx80Exp( b );
4239    bSign = extractFloatx80Sign( b );
4240    zSign = aSign ^ bSign;
4241    if ( aExp == 0x7FFF ) {
4242        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4243        if ( bExp == 0x7FFF ) {
4244            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4245            goto invalid;
4246        }
4247        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4248    }
4249    if ( bExp == 0x7FFF ) {
4250        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4251        return packFloatx80( zSign, 0, 0 );
4252    }
4253    if ( bExp == 0 ) {
4254        if ( bSig == 0 ) {
4255            if ( ( aExp | aSig ) == 0 ) {
4256 invalid:
4257                float_raise( float_flag_invalid STATUS_VAR);
4258                z.low = floatx80_default_nan_low;
4259                z.high = floatx80_default_nan_high;
4260                return z;
4261            }
4262            float_raise( float_flag_divbyzero STATUS_VAR);
4263            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4264        }
4265        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4266    }
4267    if ( aExp == 0 ) {
4268        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4269        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4270    }
4271    zExp = aExp - bExp + 0x3FFE;
4272    rem1 = 0;
4273    if ( bSig <= aSig ) {
4274        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4275        ++zExp;
4276    }
4277    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4278    mul64To128( bSig, zSig0, &term0, &term1 );
4279    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4280    while ( (sbits64) rem0 < 0 ) {
4281        --zSig0;
4282        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4283    }
4284    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4285    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4286        mul64To128( bSig, zSig1, &term1, &term2 );
4287        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4288        while ( (sbits64) rem1 < 0 ) {
4289            --zSig1;
4290            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4291        }
4292        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4293    }
4294    return
4295        roundAndPackFloatx80(
4296            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4297
4298}
4299
4300/*----------------------------------------------------------------------------
4301| Returns the remainder of the extended double-precision floating-point value
4302| `a' with respect to the corresponding value `b'.  The operation is performed
4303| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4304*----------------------------------------------------------------------------*/
4305
4306floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4307{
4308    flag aSign, zSign;
4309    int32 aExp, bExp, expDiff;
4310    bits64 aSig0, aSig1, bSig;
4311    bits64 q, term0, term1, alternateASig0, alternateASig1;
4312    floatx80 z;
4313
4314    aSig0 = extractFloatx80Frac( a );
4315    aExp = extractFloatx80Exp( a );
4316    aSign = extractFloatx80Sign( a );
4317    bSig = extractFloatx80Frac( b );
4318    bExp = extractFloatx80Exp( b );
4319    if ( aExp == 0x7FFF ) {
4320        if (    (bits64) ( aSig0<<1 )
4321             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4322            return propagateFloatx80NaN( a, b STATUS_VAR );
4323        }
4324        goto invalid;
4325    }
4326    if ( bExp == 0x7FFF ) {
4327        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4328        return a;
4329    }
4330    if ( bExp == 0 ) {
4331        if ( bSig == 0 ) {
4332 invalid:
4333            float_raise( float_flag_invalid STATUS_VAR);
4334            z.low = floatx80_default_nan_low;
4335            z.high = floatx80_default_nan_high;
4336            return z;
4337        }
4338        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4339    }
4340    if ( aExp == 0 ) {
4341        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4342        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4343    }
4344    bSig |= LIT64( 0x8000000000000000 );
4345    zSign = aSign;
4346    expDiff = aExp - bExp;
4347    aSig1 = 0;
4348    if ( expDiff < 0 ) {
4349        if ( expDiff < -1 ) return a;
4350        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4351        expDiff = 0;
4352    }
4353    q = ( bSig <= aSig0 );
4354    if ( q ) aSig0 -= bSig;
4355    expDiff -= 64;
4356    while ( 0 < expDiff ) {
4357        q = estimateDiv128To64( aSig0, aSig1, bSig );
4358        q = ( 2 < q ) ? q - 2 : 0;
4359        mul64To128( bSig, q, &term0, &term1 );
4360        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4361        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4362        expDiff -= 62;
4363    }
4364    expDiff += 64;
4365    if ( 0 < expDiff ) {
4366        q = estimateDiv128To64( aSig0, aSig1, bSig );
4367        q = ( 2 < q ) ? q - 2 : 0;
4368        q >>= 64 - expDiff;
4369        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4370        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4371        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4372        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4373            ++q;
4374            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4375        }
4376    }
4377    else {
4378        term1 = 0;
4379        term0 = bSig;
4380    }
4381    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4382    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4383         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4384              && ( q & 1 ) )
4385       ) {
4386        aSig0 = alternateASig0;
4387        aSig1 = alternateASig1;
4388        zSign = ! zSign;
4389    }
4390    return
4391        normalizeRoundAndPackFloatx80(
4392            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4393
4394}
4395
4396/*----------------------------------------------------------------------------
4397| Returns the square root of the extended double-precision floating-point
4398| value `a'.  The operation is performed according to the IEC/IEEE Standard
4399| for Binary Floating-Point Arithmetic.
4400*----------------------------------------------------------------------------*/
4401
4402floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4403{
4404    flag aSign;
4405    int32 aExp, zExp;
4406    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4407    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4408    floatx80 z;
4409
4410    aSig0 = extractFloatx80Frac( a );
4411    aExp = extractFloatx80Exp( a );
4412    aSign = extractFloatx80Sign( a );
4413    if ( aExp == 0x7FFF ) {
4414        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4415        if ( ! aSign ) return a;
4416        goto invalid;
4417    }
4418    if ( aSign ) {
4419        if ( ( aExp | aSig0 ) == 0 ) return a;
4420 invalid:
4421        float_raise( float_flag_invalid STATUS_VAR);
4422        z.low = floatx80_default_nan_low;
4423        z.high = floatx80_default_nan_high;
4424        return z;
4425    }
4426    if ( aExp == 0 ) {
4427        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4428        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4429    }
4430    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4431    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4432    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4433    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4434    doubleZSig0 = zSig0<<1;
4435    mul64To128( zSig0, zSig0, &term0, &term1 );
4436    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4437    while ( (sbits64) rem0 < 0 ) {
4438        --zSig0;
4439        doubleZSig0 -= 2;
4440        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4441    }
4442    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4443    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4444        if ( zSig1 == 0 ) zSig1 = 1;
4445        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4446        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4447        mul64To128( zSig1, zSig1, &term2, &term3 );
4448        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4449        while ( (sbits64) rem1 < 0 ) {
4450            --zSig1;
4451            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4452            term3 |= 1;
4453            term2 |= doubleZSig0;
4454            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4455        }
4456        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4457    }
4458    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4459    zSig0 |= doubleZSig0;
4460    return
4461        roundAndPackFloatx80(
4462            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4463
4464}
4465
4466/*----------------------------------------------------------------------------
4467| Returns 1 if the extended double-precision floating-point value `a' is
4468| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4469| performed according to the IEC/IEEE Standard for Binary Floating-Point
4470| Arithmetic.
4471*----------------------------------------------------------------------------*/
4472
4473int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4474{
4475
4476    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4477              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4478         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4479              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4480       ) {
4481        if (    floatx80_is_signaling_nan( a )
4482             || floatx80_is_signaling_nan( b ) ) {
4483            float_raise( float_flag_invalid STATUS_VAR);
4484        }
4485        return 0;
4486    }
4487    return
4488           ( a.low == b.low )
4489        && (    ( a.high == b.high )
4490             || (    ( a.low == 0 )
4491                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4492           );
4493
4494}
4495
4496/*----------------------------------------------------------------------------
4497| Returns 1 if the extended double-precision floating-point value `a' is
4498| less than or equal to the corresponding value `b', and 0 otherwise.  The
4499| comparison is performed according to the IEC/IEEE Standard for Binary
4500| Floating-Point Arithmetic.
4501*----------------------------------------------------------------------------*/
4502
4503int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4504{
4505    flag aSign, bSign;
4506
4507    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4508              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4509         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4510              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4511       ) {
4512        float_raise( float_flag_invalid STATUS_VAR);
4513        return 0;
4514    }
4515    aSign = extractFloatx80Sign( a );
4516    bSign = extractFloatx80Sign( b );
4517    if ( aSign != bSign ) {
4518        return
4519               aSign
4520            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4521                 == 0 );
4522    }
4523    return
4524          aSign ? le128( b.high, b.low, a.high, a.low )
4525        : le128( a.high, a.low, b.high, b.low );
4526
4527}
4528
4529/*----------------------------------------------------------------------------
4530| Returns 1 if the extended double-precision floating-point value `a' is
4531| less than the corresponding value `b', and 0 otherwise.  The comparison
4532| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4533| Arithmetic.
4534*----------------------------------------------------------------------------*/
4535
4536int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4537{
4538    flag aSign, bSign;
4539
4540    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4541              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4542         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4543              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4544       ) {
4545        float_raise( float_flag_invalid STATUS_VAR);
4546        return 0;
4547    }
4548    aSign = extractFloatx80Sign( a );
4549    bSign = extractFloatx80Sign( b );
4550    if ( aSign != bSign ) {
4551        return
4552               aSign
4553            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4554                 != 0 );
4555    }
4556    return
4557          aSign ? lt128( b.high, b.low, a.high, a.low )
4558        : lt128( a.high, a.low, b.high, b.low );
4559
4560}
4561
4562/*----------------------------------------------------------------------------
4563| Returns 1 if the extended double-precision floating-point value `a' is equal
4564| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4565| raised if either operand is a NaN.  Otherwise, the comparison is performed
4566| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4567*----------------------------------------------------------------------------*/
4568
4569int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4570{
4571
4572    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4573              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4574         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4575              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4576       ) {
4577        float_raise( float_flag_invalid STATUS_VAR);
4578        return 0;
4579    }
4580    return
4581           ( a.low == b.low )
4582        && (    ( a.high == b.high )
4583             || (    ( a.low == 0 )
4584                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4585           );
4586
4587}
4588
4589/*----------------------------------------------------------------------------
4590| Returns 1 if the extended double-precision floating-point value `a' is less
4591| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4592| do not cause an exception.  Otherwise, the comparison is performed according
4593| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4594*----------------------------------------------------------------------------*/
4595
4596int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4597{
4598    flag aSign, bSign;
4599
4600    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4601              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4602         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4603              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4604       ) {
4605        if (    floatx80_is_signaling_nan( a )
4606             || floatx80_is_signaling_nan( b ) ) {
4607            float_raise( float_flag_invalid STATUS_VAR);
4608        }
4609        return 0;
4610    }
4611    aSign = extractFloatx80Sign( a );
4612    bSign = extractFloatx80Sign( b );
4613    if ( aSign != bSign ) {
4614        return
4615               aSign
4616            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4617                 == 0 );
4618    }
4619    return
4620          aSign ? le128( b.high, b.low, a.high, a.low )
4621        : le128( a.high, a.low, b.high, b.low );
4622
4623}
4624
4625/*----------------------------------------------------------------------------
4626| Returns 1 if the extended double-precision floating-point value `a' is less
4627| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4628| an exception.  Otherwise, the comparison is performed according to the
4629| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630*----------------------------------------------------------------------------*/
4631
4632int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4633{
4634    flag aSign, bSign;
4635
4636    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4637              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4638         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4639              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4640       ) {
4641        if (    floatx80_is_signaling_nan( a )
4642             || floatx80_is_signaling_nan( b ) ) {
4643            float_raise( float_flag_invalid STATUS_VAR);
4644        }
4645        return 0;
4646    }
4647    aSign = extractFloatx80Sign( a );
4648    bSign = extractFloatx80Sign( b );
4649    if ( aSign != bSign ) {
4650        return
4651               aSign
4652            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4653                 != 0 );
4654    }
4655    return
4656          aSign ? lt128( b.high, b.low, a.high, a.low )
4657        : lt128( a.high, a.low, b.high, b.low );
4658
4659}
4660
4661#endif
4662
4663#ifdef FLOAT128
4664
4665/*----------------------------------------------------------------------------
4666| Returns the result of converting the quadruple-precision floating-point
4667| value `a' to the 32-bit two's complement integer format.  The conversion
4668| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4669| Arithmetic---which means in particular that the conversion is rounded
4670| according to the current rounding mode.  If `a' is a NaN, the largest
4671| positive integer is returned.  Otherwise, if the conversion overflows, the
4672| largest integer with the same sign as `a' is returned.
4673*----------------------------------------------------------------------------*/
4674
4675int32 float128_to_int32( float128 a STATUS_PARAM )
4676{
4677    flag aSign;
4678    int32 aExp, shiftCount;
4679    bits64 aSig0, aSig1;
4680
4681    aSig1 = extractFloat128Frac1( a );
4682    aSig0 = extractFloat128Frac0( a );
4683    aExp = extractFloat128Exp( a );
4684    aSign = extractFloat128Sign( a );
4685    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4686    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4687    aSig0 |= ( aSig1 != 0 );
4688    shiftCount = 0x4028 - aExp;
4689    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4690    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4691
4692}
4693
4694/*----------------------------------------------------------------------------
4695| Returns the result of converting the quadruple-precision floating-point
4696| value `a' to the 32-bit two's complement integer format.  The conversion
4697| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4698| Arithmetic, except that the conversion is always rounded toward zero.  If
4699| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4700| conversion overflows, the largest integer with the same sign as `a' is
4701| returned.
4702*----------------------------------------------------------------------------*/
4703
4704int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4705{
4706    flag aSign;
4707    int32 aExp, shiftCount;
4708    bits64 aSig0, aSig1, savedASig;
4709    int32 z;
4710
4711    aSig1 = extractFloat128Frac1( a );
4712    aSig0 = extractFloat128Frac0( a );
4713    aExp = extractFloat128Exp( a );
4714    aSign = extractFloat128Sign( a );
4715    aSig0 |= ( aSig1 != 0 );
4716    if ( 0x401E < aExp ) {
4717        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4718        goto invalid;
4719    }
4720    else if ( aExp < 0x3FFF ) {
4721        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4722        return 0;
4723    }
4724    aSig0 |= LIT64( 0x0001000000000000 );
4725    shiftCount = 0x402F - aExp;
4726    savedASig = aSig0;
4727    aSig0 >>= shiftCount;
4728    z = aSig0;
4729    if ( aSign ) z = - z;
4730    if ( ( z < 0 ) ^ aSign ) {
4731 invalid:
4732        float_raise( float_flag_invalid STATUS_VAR);
4733        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4734    }
4735    if ( ( aSig0<<shiftCount ) != savedASig ) {
4736        STATUS(float_exception_flags) |= float_flag_inexact;
4737    }
4738    return z;
4739
4740}
4741
4742/*----------------------------------------------------------------------------
4743| Returns the result of converting the quadruple-precision floating-point
4744| value `a' to the 64-bit two's complement integer format.  The conversion
4745| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4746| Arithmetic---which means in particular that the conversion is rounded
4747| according to the current rounding mode.  If `a' is a NaN, the largest
4748| positive integer is returned.  Otherwise, if the conversion overflows, the
4749| largest integer with the same sign as `a' is returned.
4750*----------------------------------------------------------------------------*/
4751
4752int64 float128_to_int64( float128 a STATUS_PARAM )
4753{
4754    flag aSign;
4755    int32 aExp, shiftCount;
4756    bits64 aSig0, aSig1;
4757
4758    aSig1 = extractFloat128Frac1( a );
4759    aSig0 = extractFloat128Frac0( a );
4760    aExp = extractFloat128Exp( a );
4761    aSign = extractFloat128Sign( a );
4762    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4763    shiftCount = 0x402F - aExp;
4764    if ( shiftCount <= 0 ) {
4765        if ( 0x403E < aExp ) {
4766            float_raise( float_flag_invalid STATUS_VAR);
4767            if (    ! aSign
4768                 || (    ( aExp == 0x7FFF )
4769                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4770                    )
4771               ) {
4772                return LIT64( 0x7FFFFFFFFFFFFFFF );
4773            }
4774            return (sbits64) LIT64( 0x8000000000000000 );
4775        }
4776        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4777    }
4778    else {
4779        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4780    }
4781    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4782
4783}
4784
4785/*----------------------------------------------------------------------------
4786| Returns the result of converting the quadruple-precision floating-point
4787| value `a' to the 64-bit two's complement integer format.  The conversion
4788| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4789| Arithmetic, except that the conversion is always rounded toward zero.
4790| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4791| the conversion overflows, the largest integer with the same sign as `a' is
4792| returned.
4793*----------------------------------------------------------------------------*/
4794
4795int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4796{
4797    flag aSign;
4798    int32 aExp, shiftCount;
4799    bits64 aSig0, aSig1;
4800    int64 z;
4801
4802    aSig1 = extractFloat128Frac1( a );
4803    aSig0 = extractFloat128Frac0( a );
4804    aExp = extractFloat128Exp( a );
4805    aSign = extractFloat128Sign( a );
4806    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4807    shiftCount = aExp - 0x402F;
4808    if ( 0 < shiftCount ) {
4809        if ( 0x403E <= aExp ) {
4810            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4811            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4812                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4813                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4814            }
4815            else {
4816                float_raise( float_flag_invalid STATUS_VAR);
4817                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4818                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4819                }
4820            }
4821            return (sbits64) LIT64( 0x8000000000000000 );
4822        }
4823        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4824        if ( (bits64) ( aSig1<<shiftCount ) ) {
4825            STATUS(float_exception_flags) |= float_flag_inexact;
4826        }
4827    }
4828    else {
4829        if ( aExp < 0x3FFF ) {
4830            if ( aExp | aSig0 | aSig1 ) {
4831                STATUS(float_exception_flags) |= float_flag_inexact;
4832            }
4833            return 0;
4834        }
4835        z = aSig0>>( - shiftCount );
4836        if (    aSig1
4837             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4838            STATUS(float_exception_flags) |= float_flag_inexact;
4839        }
4840    }
4841    if ( aSign ) z = - z;
4842    return z;
4843
4844}
4845
4846/*----------------------------------------------------------------------------
4847| Returns the result of converting the quadruple-precision floating-point
4848| value `a' to the single-precision floating-point format.  The conversion
4849| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4850| Arithmetic.
4851*----------------------------------------------------------------------------*/
4852
4853float32 float128_to_float32( float128 a STATUS_PARAM )
4854{
4855    flag aSign;
4856    int32 aExp;
4857    bits64 aSig0, aSig1;
4858    bits32 zSig;
4859
4860    aSig1 = extractFloat128Frac1( a );
4861    aSig0 = extractFloat128Frac0( a );
4862    aExp = extractFloat128Exp( a );
4863    aSign = extractFloat128Sign( a );
4864    if ( aExp == 0x7FFF ) {
4865        if ( aSig0 | aSig1 ) {
4866            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4867        }
4868        return packFloat32( aSign, 0xFF, 0 );
4869    }
4870    aSig0 |= ( aSig1 != 0 );
4871    shift64RightJamming( aSig0, 18, &aSig0 );
4872    zSig = aSig0;
4873    if ( aExp || zSig ) {
4874        zSig |= 0x40000000;
4875        aExp -= 0x3F81;
4876    }
4877    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4878
4879}
4880
4881/*----------------------------------------------------------------------------
4882| Returns the result of converting the quadruple-precision floating-point
4883| value `a' to the double-precision floating-point format.  The conversion
4884| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885| Arithmetic.
4886*----------------------------------------------------------------------------*/
4887
4888float64 float128_to_float64( float128 a STATUS_PARAM )
4889{
4890    flag aSign;
4891    int32 aExp;
4892    bits64 aSig0, aSig1;
4893
4894    aSig1 = extractFloat128Frac1( a );
4895    aSig0 = extractFloat128Frac0( a );
4896    aExp = extractFloat128Exp( a );
4897    aSign = extractFloat128Sign( a );
4898    if ( aExp == 0x7FFF ) {
4899        if ( aSig0 | aSig1 ) {
4900            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4901        }
4902        return packFloat64( aSign, 0x7FF, 0 );
4903    }
4904    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4905    aSig0 |= ( aSig1 != 0 );
4906    if ( aExp || aSig0 ) {
4907        aSig0 |= LIT64( 0x4000000000000000 );
4908        aExp -= 0x3C01;
4909    }
4910    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4911
4912}
4913
4914#ifdef FLOATX80
4915
4916/*----------------------------------------------------------------------------
4917| Returns the result of converting the quadruple-precision floating-point
4918| value `a' to the extended double-precision floating-point format.  The
4919| conversion is performed according to the IEC/IEEE Standard for Binary
4920| Floating-Point Arithmetic.
4921*----------------------------------------------------------------------------*/
4922
4923floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4924{
4925    flag aSign;
4926    int32 aExp;
4927    bits64 aSig0, aSig1;
4928
4929    aSig1 = extractFloat128Frac1( a );
4930    aSig0 = extractFloat128Frac0( a );
4931    aExp = extractFloat128Exp( a );
4932    aSign = extractFloat128Sign( a );
4933    if ( aExp == 0x7FFF ) {
4934        if ( aSig0 | aSig1 ) {
4935            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4936        }
4937        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4938    }
4939    if ( aExp == 0 ) {
4940        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4941        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4942    }
4943    else {
4944        aSig0 |= LIT64( 0x0001000000000000 );
4945    }
4946    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4947    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4948
4949}
4950
4951#endif
4952
4953/*----------------------------------------------------------------------------
4954| Rounds the quadruple-precision floating-point value `a' to an integer, and
4955| returns the result as a quadruple-precision floating-point value.  The
4956| operation is performed according to the IEC/IEEE Standard for Binary
4957| Floating-Point Arithmetic.
4958*----------------------------------------------------------------------------*/
4959
4960float128 float128_round_to_int( float128 a STATUS_PARAM )
4961{
4962    flag aSign;
4963    int32 aExp;
4964    bits64 lastBitMask, roundBitsMask;
4965    int8 roundingMode;
4966    float128 z;
4967
4968    aExp = extractFloat128Exp( a );
4969    if ( 0x402F <= aExp ) {
4970        if ( 0x406F <= aExp ) {
4971            if (    ( aExp == 0x7FFF )
4972                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4973               ) {
4974                return propagateFloat128NaN( a, a STATUS_VAR );
4975            }
4976            return a;
4977        }
4978        lastBitMask = 1;
4979        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4980        roundBitsMask = lastBitMask - 1;
4981        z = a;
4982        roundingMode = STATUS(float_rounding_mode);
4983        if ( roundingMode == float_round_nearest_even ) {
4984            if ( lastBitMask ) {
4985                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4986                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4987            }
4988            else {
4989                if ( (sbits64) z.low < 0 ) {
4990                    ++z.high;
4991                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4992                }
4993            }
4994        }
4995        else if ( roundingMode != float_round_to_zero ) {
4996            if (   extractFloat128Sign( z )
4997                 ^ ( roundingMode == float_round_up ) ) {
4998                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4999            }
5000        }
5001        z.low &= ~ roundBitsMask;
5002    }
5003    else {
5004        if ( aExp < 0x3FFF ) {
5005            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5006            STATUS(float_exception_flags) |= float_flag_inexact;
5007            aSign = extractFloat128Sign( a );
5008            switch ( STATUS(float_rounding_mode) ) {
5009             case float_round_nearest_even:
5010                if (    ( aExp == 0x3FFE )
5011                     && (   extractFloat128Frac0( a )
5012                          | extractFloat128Frac1( a ) )
5013                   ) {
5014                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5015                }
5016                break;
5017             case float_round_down:
5018                return
5019                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5020                    : packFloat128( 0, 0, 0, 0 );
5021             case float_round_up:
5022                return
5023                      aSign ? packFloat128( 1, 0, 0, 0 )
5024                    : packFloat128( 0, 0x3FFF, 0, 0 );
5025            }
5026            return packFloat128( aSign, 0, 0, 0 );
5027        }
5028        lastBitMask = 1;
5029        lastBitMask <<= 0x402F - aExp;
5030        roundBitsMask = lastBitMask - 1;
5031        z.low = 0;
5032        z.high = a.high;
5033        roundingMode = STATUS(float_rounding_mode);
5034        if ( roundingMode == float_round_nearest_even ) {
5035            z.high += lastBitMask>>1;
5036            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5037                z.high &= ~ lastBitMask;
5038            }
5039        }
5040        else if ( roundingMode != float_round_to_zero ) {
5041            if (   extractFloat128Sign( z )
5042                 ^ ( roundingMode == float_round_up ) ) {
5043                z.high |= ( a.low != 0 );
5044                z.high += roundBitsMask;
5045            }
5046        }
5047        z.high &= ~ roundBitsMask;
5048    }
5049    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5050        STATUS(float_exception_flags) |= float_flag_inexact;
5051    }
5052    return z;
5053
5054}
5055
5056/*----------------------------------------------------------------------------
5057| Returns the result of adding the absolute values of the quadruple-precision
5058| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5059| before being returned.  `zSign' is ignored if the result is a NaN.
5060| The addition is performed according to the IEC/IEEE Standard for Binary
5061| Floating-Point Arithmetic.
5062*----------------------------------------------------------------------------*/
5063
5064static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5065{
5066    int32 aExp, bExp, zExp;
5067    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5068    int32 expDiff;
5069
5070    aSig1 = extractFloat128Frac1( a );
5071    aSig0 = extractFloat128Frac0( a );
5072    aExp = extractFloat128Exp( a );
5073    bSig1 = extractFloat128Frac1( b );
5074    bSig0 = extractFloat128Frac0( b );
5075    bExp = extractFloat128Exp( b );
5076    expDiff = aExp - bExp;
5077    if ( 0 < expDiff ) {
5078        if ( aExp == 0x7FFF ) {
5079            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5080            return a;
5081        }
5082        if ( bExp == 0 ) {
5083            --expDiff;
5084        }
5085        else {
5086            bSig0 |= LIT64( 0x0001000000000000 );
5087        }
5088        shift128ExtraRightJamming(
5089            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5090        zExp = aExp;
5091    }
5092    else if ( expDiff < 0 ) {
5093        if ( bExp == 0x7FFF ) {
5094            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5095            return packFloat128( zSign, 0x7FFF, 0, 0 );
5096        }
5097        if ( aExp == 0 ) {
5098            ++expDiff;
5099        }
5100        else {
5101            aSig0 |= LIT64( 0x0001000000000000 );
5102        }
5103        shift128ExtraRightJamming(
5104            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5105        zExp = bExp;
5106    }
5107    else {
5108        if ( aExp == 0x7FFF ) {
5109            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5110                return propagateFloat128NaN( a, b STATUS_VAR );
5111            }
5112            return a;
5113        }
5114        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5115        if ( aExp == 0 ) {
5116            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5117            return packFloat128( zSign, 0, zSig0, zSig1 );
5118        }
5119        zSig2 = 0;
5120        zSig0 |= LIT64( 0x0002000000000000 );
5121        zExp = aExp;
5122        goto shiftRight1;
5123    }
5124    aSig0 |= LIT64( 0x0001000000000000 );
5125    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5126    --zExp;
5127    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5128    ++zExp;
5129 shiftRight1:
5130    shift128ExtraRightJamming(
5131        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5132 roundAndPack:
5133    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5134
5135}
5136
5137/*----------------------------------------------------------------------------
5138| Returns the result of subtracting the absolute values of the quadruple-
5139| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5140| difference is negated before being returned.  `zSign' is ignored if the
5141| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5142| Standard for Binary Floating-Point Arithmetic.
5143*----------------------------------------------------------------------------*/
5144
5145static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5146{
5147    int32 aExp, bExp, zExp;
5148    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5149    int32 expDiff;
5150    float128 z;
5151
5152    aSig1 = extractFloat128Frac1( a );
5153    aSig0 = extractFloat128Frac0( a );
5154    aExp = extractFloat128Exp( a );
5155    bSig1 = extractFloat128Frac1( b );
5156    bSig0 = extractFloat128Frac0( b );
5157    bExp = extractFloat128Exp( b );
5158    expDiff = aExp - bExp;
5159    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5160    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5161    if ( 0 < expDiff ) goto aExpBigger;
5162    if ( expDiff < 0 ) goto bExpBigger;
5163    if ( aExp == 0x7FFF ) {
5164        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5165            return propagateFloat128NaN( a, b STATUS_VAR );
5166        }
5167        float_raise( float_flag_invalid STATUS_VAR);
5168        z.low = float128_default_nan_low;
5169        z.high = float128_default_nan_high;
5170        return z;
5171    }
5172    if ( aExp == 0 ) {
5173        aExp = 1;
5174        bExp = 1;
5175    }
5176    if ( bSig0 < aSig0 ) goto aBigger;
5177    if ( aSig0 < bSig0 ) goto bBigger;
5178    if ( bSig1 < aSig1 ) goto aBigger;
5179    if ( aSig1 < bSig1 ) goto bBigger;
5180    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5181 bExpBigger:
5182    if ( bExp == 0x7FFF ) {
5183        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5184        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5185    }
5186    if ( aExp == 0 ) {
5187        ++expDiff;
5188    }
5189    else {
5190        aSig0 |= LIT64( 0x4000000000000000 );
5191    }
5192    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5193    bSig0 |= LIT64( 0x4000000000000000 );
5194 bBigger:
5195    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5196    zExp = bExp;
5197    zSign ^= 1;
5198    goto normalizeRoundAndPack;
5199 aExpBigger:
5200    if ( aExp == 0x7FFF ) {
5201        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5202        return a;
5203    }
5204    if ( bExp == 0 ) {
5205        --expDiff;
5206    }
5207    else {
5208        bSig0 |= LIT64( 0x4000000000000000 );
5209    }
5210    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5211    aSig0 |= LIT64( 0x4000000000000000 );
5212 aBigger:
5213    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5214    zExp = aExp;
5215 normalizeRoundAndPack:
5216    --zExp;
5217    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5218
5219}
5220
5221/*----------------------------------------------------------------------------
5222| Returns the result of adding the quadruple-precision floating-point values
5223| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5224| for Binary Floating-Point Arithmetic.
5225*----------------------------------------------------------------------------*/
5226
5227float128 float128_add( float128 a, float128 b STATUS_PARAM )
5228{
5229    flag aSign, bSign;
5230
5231    aSign = extractFloat128Sign( a );
5232    bSign = extractFloat128Sign( b );
5233    if ( aSign == bSign ) {
5234        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5235    }
5236    else {
5237        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5238    }
5239
5240}
5241
5242/*----------------------------------------------------------------------------
5243| Returns the result of subtracting the quadruple-precision floating-point
5244| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5245| Standard for Binary Floating-Point Arithmetic.
5246*----------------------------------------------------------------------------*/
5247
5248float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5249{
5250    flag aSign, bSign;
5251
5252    aSign = extractFloat128Sign( a );
5253    bSign = extractFloat128Sign( b );
5254    if ( aSign == bSign ) {
5255        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5256    }
5257    else {
5258        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5259    }
5260
5261}
5262
5263/*----------------------------------------------------------------------------
5264| Returns the result of multiplying the quadruple-precision floating-point
5265| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5266| Standard for Binary Floating-Point Arithmetic.
5267*----------------------------------------------------------------------------*/
5268
5269float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5270{
5271    flag aSign, bSign, zSign;
5272    int32 aExp, bExp, zExp;
5273    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5274    float128 z;
5275
5276    aSig1 = extractFloat128Frac1( a );
5277    aSig0 = extractFloat128Frac0( a );
5278    aExp = extractFloat128Exp( a );
5279    aSign = extractFloat128Sign( a );
5280    bSig1 = extractFloat128Frac1( b );
5281    bSig0 = extractFloat128Frac0( b );
5282    bExp = extractFloat128Exp( b );
5283    bSign = extractFloat128Sign( b );
5284    zSign = aSign ^ bSign;
5285    if ( aExp == 0x7FFF ) {
5286        if (    ( aSig0 | aSig1 )
5287             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5288            return propagateFloat128NaN( a, b STATUS_VAR );
5289        }
5290        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5291        return packFloat128( zSign, 0x7FFF, 0, 0 );
5292    }
5293    if ( bExp == 0x7FFF ) {
5294        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5295        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5296 invalid:
5297            float_raise( float_flag_invalid STATUS_VAR);
5298            z.low = float128_default_nan_low;
5299            z.high = float128_default_nan_high;
5300            return z;
5301        }
5302        return packFloat128( zSign, 0x7FFF, 0, 0 );
5303    }
5304    if ( aExp == 0 ) {
5305        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5306        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5307    }
5308    if ( bExp == 0 ) {
5309        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5310        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5311    }
5312    zExp = aExp + bExp - 0x4000;
5313    aSig0 |= LIT64( 0x0001000000000000 );
5314    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5315    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5316    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5317    zSig2 |= ( zSig3 != 0 );
5318    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5319        shift128ExtraRightJamming(
5320            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5321        ++zExp;
5322    }
5323    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5324
5325}
5326
5327/*----------------------------------------------------------------------------
5328| Returns the result of dividing the quadruple-precision floating-point value
5329| `a' by the corresponding value `b'.  The operation is performed according to
5330| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5331*----------------------------------------------------------------------------*/
5332
5333float128 float128_div( float128 a, float128 b STATUS_PARAM )
5334{
5335    flag aSign, bSign, zSign;
5336    int32 aExp, bExp, zExp;
5337    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5338    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5339    float128 z;
5340
5341    aSig1 = extractFloat128Frac1( a );
5342    aSig0 = extractFloat128Frac0( a );
5343    aExp = extractFloat128Exp( a );
5344    aSign = extractFloat128Sign( a );
5345    bSig1 = extractFloat128Frac1( b );
5346    bSig0 = extractFloat128Frac0( b );
5347    bExp = extractFloat128Exp( b );
5348    bSign = extractFloat128Sign( b );
5349    zSign = aSign ^ bSign;
5350    if ( aExp == 0x7FFF ) {
5351        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5352        if ( bExp == 0x7FFF ) {
5353            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5354            goto invalid;
5355        }
5356        return packFloat128( zSign, 0x7FFF, 0, 0 );
5357    }
5358    if ( bExp == 0x7FFF ) {
5359        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5360        return packFloat128( zSign, 0, 0, 0 );
5361    }
5362    if ( bExp == 0 ) {
5363        if ( ( bSig0 | bSig1 ) == 0 ) {
5364            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5365 invalid:
5366                float_raise( float_flag_invalid STATUS_VAR);
5367                z.low = float128_default_nan_low;
5368                z.high = float128_default_nan_high;
5369                return z;
5370            }
5371            float_raise( float_flag_divbyzero STATUS_VAR);
5372            return packFloat128( zSign, 0x7FFF, 0, 0 );
5373        }
5374        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5375    }
5376    if ( aExp == 0 ) {
5377        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5378        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5379    }
5380    zExp = aExp - bExp + 0x3FFD;
5381    shortShift128Left(
5382        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5383    shortShift128Left(
5384        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5385    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5386        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5387        ++zExp;
5388    }
5389    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5390    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5391    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5392    while ( (sbits64) rem0 < 0 ) {
5393        --zSig0;
5394        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5395    }
5396    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5397    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5398        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5399        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5400        while ( (sbits64) rem1 < 0 ) {
5401            --zSig1;
5402            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5403        }
5404        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5405    }
5406    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5407    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5408
5409}
5410
5411/*----------------------------------------------------------------------------
5412| Returns the remainder of the quadruple-precision floating-point value `a'
5413| with respect to the corresponding value `b'.  The operation is performed
5414| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5415*----------------------------------------------------------------------------*/
5416
5417float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5418{
5419    flag aSign, zSign;
5420    int32 aExp, bExp, expDiff;
5421    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5422    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5423    sbits64 sigMean0;
5424    float128 z;
5425
5426    aSig1 = extractFloat128Frac1( a );
5427    aSig0 = extractFloat128Frac0( a );
5428    aExp = extractFloat128Exp( a );
5429    aSign = extractFloat128Sign( a );
5430    bSig1 = extractFloat128Frac1( b );
5431    bSig0 = extractFloat128Frac0( b );
5432    bExp = extractFloat128Exp( b );
5433    if ( aExp == 0x7FFF ) {
5434        if (    ( aSig0 | aSig1 )
5435             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5436            return propagateFloat128NaN( a, b STATUS_VAR );
5437        }
5438        goto invalid;
5439    }
5440    if ( bExp == 0x7FFF ) {
5441        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5442        return a;
5443    }
5444    if ( bExp == 0 ) {
5445        if ( ( bSig0 | bSig1 ) == 0 ) {
5446 invalid:
5447            float_raise( float_flag_invalid STATUS_VAR);
5448            z.low = float128_default_nan_low;
5449            z.high = float128_default_nan_high;
5450            return z;
5451        }
5452        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5453    }
5454    if ( aExp == 0 ) {
5455        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5456        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5457    }
5458    expDiff = aExp - bExp;
5459    if ( expDiff < -1 ) return a;
5460    shortShift128Left(
5461        aSig0 | LIT64( 0x0001000000000000 ),
5462        aSig1,
5463        15 - ( expDiff < 0 ),
5464        &aSig0,
5465        &aSig1
5466    );
5467    shortShift128Left(
5468        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5469    q = le128( bSig0, bSig1, aSig0, aSig1 );
5470    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5471    expDiff -= 64;
5472    while ( 0 < expDiff ) {
5473        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5474        q = ( 4 < q ) ? q - 4 : 0;
5475        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5476        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5477        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5478        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5479        expDiff -= 61;
5480    }
5481    if ( -64 < expDiff ) {
5482        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5483        q = ( 4 < q ) ? q - 4 : 0;
5484        q >>= - expDiff;
5485        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5486        expDiff += 52;
5487        if ( expDiff < 0 ) {
5488            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5489        }
5490        else {
5491            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5492        }
5493        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5494        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5495    }
5496    else {
5497        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5498        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5499    }
5500    do {
5501        alternateASig0 = aSig0;
5502        alternateASig1 = aSig1;
5503        ++q;
5504        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5505    } while ( 0 <= (sbits64) aSig0 );
5506    add128(
5507        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5508    if (    ( sigMean0 < 0 )
5509         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5510        aSig0 = alternateASig0;
5511        aSig1 = alternateASig1;
5512    }
5513    zSign = ( (sbits64) aSig0 < 0 );
5514    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5515    return
5516        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5517
5518}
5519
5520/*----------------------------------------------------------------------------
5521| Returns the square root of the quadruple-precision floating-point value `a'.
5522| The operation is performed according to the IEC/IEEE Standard for Binary
5523| Floating-Point Arithmetic.
5524*----------------------------------------------------------------------------*/
5525
5526float128 float128_sqrt( float128 a STATUS_PARAM )
5527{
5528    flag aSign;
5529    int32 aExp, zExp;
5530    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5531    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5532    float128 z;
5533
5534    aSig1 = extractFloat128Frac1( a );
5535    aSig0 = extractFloat128Frac0( a );
5536    aExp = extractFloat128Exp( a );
5537    aSign = extractFloat128Sign( a );
5538    if ( aExp == 0x7FFF ) {
5539        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5540        if ( ! aSign ) return a;
5541        goto invalid;
5542    }
5543    if ( aSign ) {
5544        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5545 invalid:
5546        float_raise( float_flag_invalid STATUS_VAR);
5547        z.low = float128_default_nan_low;
5548        z.high = float128_default_nan_high;
5549        return z;
5550    }
5551    if ( aExp == 0 ) {
5552        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5553        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5554    }
5555    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5556    aSig0 |= LIT64( 0x0001000000000000 );
5557    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5558    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5559    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5560    doubleZSig0 = zSig0<<1;
5561    mul64To128( zSig0, zSig0, &term0, &term1 );
5562    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5563    while ( (sbits64) rem0 < 0 ) {
5564        --zSig0;
5565        doubleZSig0 -= 2;
5566        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5567    }
5568    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5569    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5570        if ( zSig1 == 0 ) zSig1 = 1;
5571        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5572        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5573        mul64To128( zSig1, zSig1, &term2, &term3 );
5574        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5575        while ( (sbits64) rem1 < 0 ) {
5576            --zSig1;
5577            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5578            term3 |= 1;
5579            term2 |= doubleZSig0;
5580            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5581        }
5582        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5583    }
5584    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5585    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5586
5587}
5588
5589/*----------------------------------------------------------------------------
5590| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5591| the corresponding value `b', and 0 otherwise.  The comparison is performed
5592| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5593*----------------------------------------------------------------------------*/
5594
5595int float128_eq( float128 a, float128 b STATUS_PARAM )
5596{
5597
5598    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5599              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5600         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5601              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5602       ) {
5603        if (    float128_is_signaling_nan( a )
5604             || float128_is_signaling_nan( b ) ) {
5605            float_raise( float_flag_invalid STATUS_VAR);
5606        }
5607        return 0;
5608    }
5609    return
5610           ( a.low == b.low )
5611        && (    ( a.high == b.high )
5612             || (    ( a.low == 0 )
5613                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5614           );
5615
5616}
5617
5618/*----------------------------------------------------------------------------
5619| Returns 1 if the quadruple-precision floating-point value `a' is less than
5620| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5621| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5622| Arithmetic.
5623*----------------------------------------------------------------------------*/
5624
5625int float128_le( float128 a, float128 b STATUS_PARAM )
5626{
5627    flag aSign, bSign;
5628
5629    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5630              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5631         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5632              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5633       ) {
5634        float_raise( float_flag_invalid STATUS_VAR);
5635        return 0;
5636    }
5637    aSign = extractFloat128Sign( a );
5638    bSign = extractFloat128Sign( b );
5639    if ( aSign != bSign ) {
5640        return
5641               aSign
5642            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5643                 == 0 );
5644    }
5645    return
5646          aSign ? le128( b.high, b.low, a.high, a.low )
5647        : le128( a.high, a.low, b.high, b.low );
5648
5649}
5650
5651/*----------------------------------------------------------------------------
5652| Returns 1 if the quadruple-precision floating-point value `a' is less than
5653| the corresponding value `b', and 0 otherwise.  The comparison is performed
5654| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5655*----------------------------------------------------------------------------*/
5656
5657int float128_lt( float128 a, float128 b STATUS_PARAM )
5658{
5659    flag aSign, bSign;
5660
5661    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5662              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5664              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5665       ) {
5666        float_raise( float_flag_invalid STATUS_VAR);
5667        return 0;
5668    }
5669    aSign = extractFloat128Sign( a );
5670    bSign = extractFloat128Sign( b );
5671    if ( aSign != bSign ) {
5672        return
5673               aSign
5674            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5675                 != 0 );
5676    }
5677    return
5678          aSign ? lt128( b.high, b.low, a.high, a.low )
5679        : lt128( a.high, a.low, b.high, b.low );
5680
5681}
5682
5683/*----------------------------------------------------------------------------
5684| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5685| the corresponding value `b', and 0 otherwise.  The invalid exception is
5686| raised if either operand is a NaN.  Otherwise, the comparison is performed
5687| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5688*----------------------------------------------------------------------------*/
5689
5690int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5691{
5692
5693    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5694              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5695         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5696              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5697       ) {
5698        float_raise( float_flag_invalid STATUS_VAR);
5699        return 0;
5700    }
5701    return
5702           ( a.low == b.low )
5703        && (    ( a.high == b.high )
5704             || (    ( a.low == 0 )
5705                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5706           );
5707
5708}
5709
5710/*----------------------------------------------------------------------------
5711| Returns 1 if the quadruple-precision floating-point value `a' is less than
5712| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5713| cause an exception.  Otherwise, the comparison is performed according to the
5714| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5715*----------------------------------------------------------------------------*/
5716
5717int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5718{
5719    flag aSign, bSign;
5720
5721    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5722              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5723         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5724              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5725       ) {
5726        if (    float128_is_signaling_nan( a )
5727             || float128_is_signaling_nan( b ) ) {
5728            float_raise( float_flag_invalid STATUS_VAR);
5729        }
5730        return 0;
5731    }
5732    aSign = extractFloat128Sign( a );
5733    bSign = extractFloat128Sign( b );
5734    if ( aSign != bSign ) {
5735        return
5736               aSign
5737            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5738                 == 0 );
5739    }
5740    return
5741          aSign ? le128( b.high, b.low, a.high, a.low )
5742        : le128( a.high, a.low, b.high, b.low );
5743
5744}
5745
5746/*----------------------------------------------------------------------------
5747| Returns 1 if the quadruple-precision floating-point value `a' is less than
5748| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5749| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5750| Standard for Binary Floating-Point Arithmetic.
5751*----------------------------------------------------------------------------*/
5752
5753int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5754{
5755    flag aSign, bSign;
5756
5757    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5758              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5759         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5760              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5761       ) {
5762        if (    float128_is_signaling_nan( a )
5763             || float128_is_signaling_nan( b ) ) {
5764            float_raise( float_flag_invalid STATUS_VAR);
5765        }
5766        return 0;
5767    }
5768    aSign = extractFloat128Sign( a );
5769    bSign = extractFloat128Sign( b );
5770    if ( aSign != bSign ) {
5771        return
5772               aSign
5773            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5774                 != 0 );
5775    }
5776    return
5777          aSign ? lt128( b.high, b.low, a.high, a.low )
5778        : lt128( a.high, a.low, b.high, b.low );
5779
5780}
5781
5782#endif
5783
5784/* misc functions */
5785float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5786{
5787    return int64_to_float32(a STATUS_VAR);
5788}
5789
5790float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5791{
5792    return int64_to_float64(a STATUS_VAR);
5793}
5794
5795unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5796{
5797    int64_t v;
5798    unsigned int res;
5799
5800    v = float32_to_int64(a STATUS_VAR);
5801    if (v < 0) {
5802        res = 0;
5803        float_raise( float_flag_invalid STATUS_VAR);
5804    } else if (v > 0xffffffff) {
5805        res = 0xffffffff;
5806        float_raise( float_flag_invalid STATUS_VAR);
5807    } else {
5808        res = v;
5809    }
5810    return res;
5811}
5812
5813unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5814{
5815    int64_t v;
5816    unsigned int res;
5817
5818    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5819    if (v < 0) {
5820        res = 0;
5821        float_raise( float_flag_invalid STATUS_VAR);
5822    } else if (v > 0xffffffff) {
5823        res = 0xffffffff;
5824        float_raise( float_flag_invalid STATUS_VAR);
5825    } else {
5826        res = v;
5827    }
5828    return res;
5829}
5830
5831unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5832{
5833    int64_t v;
5834    unsigned int res;
5835
5836    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5837    if (v < 0) {
5838        res = 0;
5839        float_raise( float_flag_invalid STATUS_VAR);
5840    } else if (v > 0xffff) {
5841        res = 0xffff;
5842        float_raise( float_flag_invalid STATUS_VAR);
5843    } else {
5844        res = v;
5845    }
5846    return res;
5847}
5848
5849unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5850{
5851    int64_t v;
5852    unsigned int res;
5853
5854    v = float64_to_int64(a STATUS_VAR);
5855    if (v < 0) {
5856        res = 0;
5857        float_raise( float_flag_invalid STATUS_VAR);
5858    } else if (v > 0xffffffff) {
5859        res = 0xffffffff;
5860        float_raise( float_flag_invalid STATUS_VAR);
5861    } else {
5862        res = v;
5863    }
5864    return res;
5865}
5866
5867unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5868{
5869    int64_t v;
5870    unsigned int res;
5871
5872    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5873    if (v < 0) {
5874        res = 0;
5875        float_raise( float_flag_invalid STATUS_VAR);
5876    } else if (v > 0xffffffff) {
5877        res = 0xffffffff;
5878        float_raise( float_flag_invalid STATUS_VAR);
5879    } else {
5880        res = v;
5881    }
5882    return res;
5883}
5884
5885unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5886{
5887    int64_t v;
5888    unsigned int res;
5889
5890    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5891    if (v < 0) {
5892        res = 0;
5893        float_raise( float_flag_invalid STATUS_VAR);
5894    } else if (v > 0xffff) {
5895        res = 0xffff;
5896        float_raise( float_flag_invalid STATUS_VAR);
5897    } else {
5898        res = v;
5899    }
5900    return res;
5901}
5902
5903/* FIXME: This looks broken.  */
5904uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5905{
5906    int64_t v;
5907
5908    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5909    v += float64_val(a);
5910    v = float64_to_int64(make_float64(v) STATUS_VAR);
5911
5912    return v - INT64_MIN;
5913}
5914
5915uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5916{
5917    int64_t v;
5918
5919    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5920    v += float64_val(a);
5921    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5922
5923    return v - INT64_MIN;
5924}
5925
5926#define COMPARE(s, nan_exp)                                                  \
5927INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5928                                      int is_quiet STATUS_PARAM )            \
5929{                                                                            \
5930    flag aSign, bSign;                                                       \
5931    bits ## s av, bv;                                                        \
5932    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5933    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5934                                                                             \
5935    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5936         extractFloat ## s ## Frac( a ) ) ||                                 \
5937        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5938          extractFloat ## s ## Frac( b ) )) {                                \
5939        if (!is_quiet ||                                                     \
5940            float ## s ## _is_signaling_nan( a ) ||                          \
5941            float ## s ## _is_signaling_nan( b ) ) {                         \
5942            float_raise( float_flag_invalid STATUS_VAR);                     \
5943        }                                                                    \
5944        return float_relation_unordered;                                     \
5945    }                                                                        \
5946    aSign = extractFloat ## s ## Sign( a );                                  \
5947    bSign = extractFloat ## s ## Sign( b );                                  \
5948    av = float ## s ## _val(a);                                              \
5949    bv = float ## s ## _val(b);                                              \
5950    if ( aSign != bSign ) {                                                  \
5951        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5952            /* zero case */                                                  \
5953            return float_relation_equal;                                     \
5954        } else {                                                             \
5955            return 1 - (2 * aSign);                                          \
5956        }                                                                    \
5957    } else {                                                                 \
5958        if (av == bv) {                                                      \
5959            return float_relation_equal;                                     \
5960        } else {                                                             \
5961            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5962        }                                                                    \
5963    }                                                                        \
5964}                                                                            \
5965                                                                             \
5966int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5967{                                                                            \
5968    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5969}                                                                            \
5970                                                                             \
5971int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5972{                                                                            \
5973    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5974}
5975
5976COMPARE(32, 0xff)
5977COMPARE(64, 0x7ff)
5978
5979INLINE int float128_compare_internal( float128 a, float128 b,
5980                                      int is_quiet STATUS_PARAM )
5981{
5982    flag aSign, bSign;
5983
5984    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5985          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5986        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5987          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5988        if (!is_quiet ||
5989            float128_is_signaling_nan( a ) ||
5990            float128_is_signaling_nan( b ) ) {
5991            float_raise( float_flag_invalid STATUS_VAR);
5992        }
5993        return float_relation_unordered;
5994    }
5995    aSign = extractFloat128Sign( a );
5996    bSign = extractFloat128Sign( b );
5997    if ( aSign != bSign ) {
5998        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5999            /* zero case */
6000            return float_relation_equal;
6001        } else {
6002            return 1 - (2 * aSign);
6003        }
6004    } else {
6005        if (a.low == b.low && a.high == b.high) {
6006            return float_relation_equal;
6007        } else {
6008            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6009        }
6010    }
6011}
6012
6013int float128_compare( float128 a, float128 b STATUS_PARAM )
6014{
6015    return float128_compare_internal(a, b, 0 STATUS_VAR);
6016}
6017
6018int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6019{
6020    return float128_compare_internal(a, b, 1 STATUS_VAR);
6021}
6022
6023/* Multiply A by 2 raised to the power N.  */
6024float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6025{
6026    flag aSign;
6027    int16 aExp;
6028    bits32 aSig;
6029
6030    a = float32_squash_input_denormal(a STATUS_VAR);
6031    aSig = extractFloat32Frac( a );
6032    aExp = extractFloat32Exp( a );
6033    aSign = extractFloat32Sign( a );
6034
6035    if ( aExp == 0xFF ) {
6036        return a;
6037    }
6038    if ( aExp != 0 )
6039        aSig |= 0x00800000;
6040    else if ( aSig == 0 )
6041        return a;
6042
6043    aExp += n - 1;
6044    aSig <<= 7;
6045    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6046}
6047
6048float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6049{
6050    flag aSign;
6051    int16 aExp;
6052    bits64 aSig;
6053
6054    a = float64_squash_input_denormal(a STATUS_VAR);
6055    aSig = extractFloat64Frac( a );
6056    aExp = extractFloat64Exp( a );
6057    aSign = extractFloat64Sign( a );
6058
6059    if ( aExp == 0x7FF ) {
6060        return a;
6061    }
6062    if ( aExp != 0 )
6063        aSig |= LIT64( 0x0010000000000000 );
6064    else if ( aSig == 0 )
6065        return a;
6066
6067    aExp += n - 1;
6068    aSig <<= 10;
6069    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6070}
6071
6072#ifdef FLOATX80
6073floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6074{
6075    flag aSign;
6076    int16 aExp;
6077    bits64 aSig;
6078
6079    aSig = extractFloatx80Frac( a );
6080    aExp = extractFloatx80Exp( a );
6081    aSign = extractFloatx80Sign( a );
6082
6083    if ( aExp == 0x7FF ) {
6084        return a;
6085    }
6086    if (aExp == 0 && aSig == 0)
6087        return a;
6088
6089    aExp += n;
6090    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6091                                          aSign, aExp, aSig, 0 STATUS_VAR );
6092}
6093#endif
6094
6095#ifdef FLOAT128
6096float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6097{
6098    flag aSign;
6099    int32 aExp;
6100    bits64 aSig0, aSig1;
6101
6102    aSig1 = extractFloat128Frac1( a );
6103    aSig0 = extractFloat128Frac0( a );
6104    aExp = extractFloat128Exp( a );
6105    aSign = extractFloat128Sign( a );
6106    if ( aExp == 0x7FFF ) {
6107        return a;
6108    }
6109    if ( aExp != 0 )
6110        aSig0 |= LIT64( 0x0001000000000000 );
6111    else if ( aSig0 == 0 && aSig1 == 0 )
6112        return a;
6113
6114    aExp += n - 1;
6115    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6116                                          STATUS_VAR );
6117
6118}
6119#endif
6120