qemu/fpu/softfloat.c
<<
>>
Prefs
   1
   2/*============================================================================
   3
   4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5Package, Release 2b.
   6
   7Written by John R. Hauser.  This work was made possible in part by the
   8International Computer Science Institute, located at Suite 600, 1947 Center
   9Street, Berkeley, California 94704.  Funding was partially provided by the
  10National Science Foundation under grant MIP-9311980.  The original version
  11of this code was written as part of a project to build a fixed-point vector
  12processor in collaboration with the University of California at Berkeley,
  13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15arithmetic/SoftFloat.html'.
  16
  17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26Derivative works are acceptable, even for commercial purposes, so long as
  27(1) the source code for the derivative work includes prominent notice that
  28the work is derivative, and (2) the source code includes prominent notice with
  29these four paragraphs for those parts of this code that are retained.
  30
  31=============================================================================*/
  32
  33/* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
  34   be flushed to zero.  */
  35#include "softfloat.h"
  36
  37/*----------------------------------------------------------------------------
  38| Primitive arithmetic functions, including multi-word arithmetic, and
  39| division and square root approximations.  (Can be specialized to target if
  40| desired.)
  41*----------------------------------------------------------------------------*/
  42#include "softfloat-macros.h"
  43
  44/*----------------------------------------------------------------------------
  45| Functions and definitions to determine:  (1) whether tininess for underflow
  46| is detected before or after rounding by default, (2) what (if anything)
  47| happens when exceptions are raised, (3) how signaling NaNs are distinguished
  48| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  49| are propagated from function inputs to output.  These details are target-
  50| specific.
  51*----------------------------------------------------------------------------*/
  52#include "softfloat-specialize.h"
  53
  54void set_float_rounding_mode(int val STATUS_PARAM)
  55{
  56    STATUS(float_rounding_mode) = val;
  57}
  58
  59void set_float_exception_flags(int val STATUS_PARAM)
  60{
  61    STATUS(float_exception_flags) = val;
  62}
  63
  64#ifdef FLOATX80
  65void set_floatx80_rounding_precision(int val STATUS_PARAM)
  66{
  67    STATUS(floatx80_rounding_precision) = val;
  68}
  69#endif
  70
  71/*----------------------------------------------------------------------------
  72| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  73| and 7, and returns the properly rounded 32-bit integer corresponding to the
  74| input.  If `zSign' is 1, the input is negated before being converted to an
  75| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  76| is simply rounded to an integer, with the inexact exception raised if the
  77| input cannot be represented exactly as an integer.  However, if the fixed-
  78| point input is too large, the invalid exception is raised and the largest
  79| positive or negative integer is returned.
  80*----------------------------------------------------------------------------*/
  81
  82static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  83{
  84    int8 roundingMode;
  85    flag roundNearestEven;
  86    int8 roundIncrement, roundBits;
  87    int32 z;
  88
  89    roundingMode = STATUS(float_rounding_mode);
  90    roundNearestEven = ( roundingMode == float_round_nearest_even );
  91    roundIncrement = 0x40;
  92    if ( ! roundNearestEven ) {
  93        if ( roundingMode == float_round_to_zero ) {
  94            roundIncrement = 0;
  95        }
  96        else {
  97            roundIncrement = 0x7F;
  98            if ( zSign ) {
  99                if ( roundingMode == float_round_up ) roundIncrement = 0;
 100            }
 101            else {
 102                if ( roundingMode == float_round_down ) roundIncrement = 0;
 103            }
 104        }
 105    }
 106    roundBits = absZ & 0x7F;
 107    absZ = ( absZ + roundIncrement )>>7;
 108    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 109    z = absZ;
 110    if ( zSign ) z = - z;
 111    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 112        float_raise( float_flag_invalid STATUS_VAR);
 113        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 114    }
 115    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 116    return z;
 117
 118}
 119
 120/*----------------------------------------------------------------------------
 121| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 122| `absZ1', with binary point between bits 63 and 64 (between the input words),
 123| and returns the properly rounded 64-bit integer corresponding to the input.
 124| If `zSign' is 1, the input is negated before being converted to an integer.
 125| Ordinarily, the fixed-point input is simply rounded to an integer, with
 126| the inexact exception raised if the input cannot be represented exactly as
 127| an integer.  However, if the fixed-point input is too large, the invalid
 128| exception is raised and the largest positive or negative integer is
 129| returned.
 130*----------------------------------------------------------------------------*/
 131
 132static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 133{
 134    int8 roundingMode;
 135    flag roundNearestEven, increment;
 136    int64 z;
 137
 138    roundingMode = STATUS(float_rounding_mode);
 139    roundNearestEven = ( roundingMode == float_round_nearest_even );
 140    increment = ( (sbits64) absZ1 < 0 );
 141    if ( ! roundNearestEven ) {
 142        if ( roundingMode == float_round_to_zero ) {
 143            increment = 0;
 144        }
 145        else {
 146            if ( zSign ) {
 147                increment = ( roundingMode == float_round_down ) && absZ1;
 148            }
 149            else {
 150                increment = ( roundingMode == float_round_up ) && absZ1;
 151            }
 152        }
 153    }
 154    if ( increment ) {
 155        ++absZ0;
 156        if ( absZ0 == 0 ) goto overflow;
 157        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 158    }
 159    z = absZ0;
 160    if ( zSign ) z = - z;
 161    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 162 overflow:
 163        float_raise( float_flag_invalid STATUS_VAR);
 164        return
 165              zSign ? (sbits64) LIT64( 0x8000000000000000 )
 166            : LIT64( 0x7FFFFFFFFFFFFFFF );
 167    }
 168    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 169    return z;
 170
 171}
 172
 173/*----------------------------------------------------------------------------
 174| Returns the fraction bits of the single-precision floating-point value `a'.
 175*----------------------------------------------------------------------------*/
 176
 177INLINE bits32 extractFloat32Frac( float32 a )
 178{
 179
 180    return float32_val(a) & 0x007FFFFF;
 181
 182}
 183
 184/*----------------------------------------------------------------------------
 185| Returns the exponent bits of the single-precision floating-point value `a'.
 186*----------------------------------------------------------------------------*/
 187
 188INLINE int16 extractFloat32Exp( float32 a )
 189{
 190
 191    return ( float32_val(a)>>23 ) & 0xFF;
 192
 193}
 194
 195/*----------------------------------------------------------------------------
 196| Returns the sign bit of the single-precision floating-point value `a'.
 197*----------------------------------------------------------------------------*/
 198
 199INLINE flag extractFloat32Sign( float32 a )
 200{
 201
 202    return float32_val(a)>>31;
 203
 204}
 205
 206/*----------------------------------------------------------------------------
 207| Normalizes the subnormal single-precision floating-point value represented
 208| by the denormalized significand `aSig'.  The normalized exponent and
 209| significand are stored at the locations pointed to by `zExpPtr' and
 210| `zSigPtr', respectively.
 211*----------------------------------------------------------------------------*/
 212
 213static void
 214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 215{
 216    int8 shiftCount;
 217
 218    shiftCount = countLeadingZeros32( aSig ) - 8;
 219    *zSigPtr = aSig<<shiftCount;
 220    *zExpPtr = 1 - shiftCount;
 221
 222}
 223
 224/*----------------------------------------------------------------------------
 225| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 226| single-precision floating-point value, returning the result.  After being
 227| shifted into the proper positions, the three fields are simply added
 228| together to form the result.  This means that any integer portion of `zSig'
 229| will be added into the exponent.  Since a properly normalized significand
 230| will have an integer portion equal to 1, the `zExp' input should be 1 less
 231| than the desired result exponent whenever `zSig' is a complete, normalized
 232| significand.
 233*----------------------------------------------------------------------------*/
 234
 235INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 236{
 237
 238    return make_float32(
 239          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 240
 241}
 242
 243/*----------------------------------------------------------------------------
 244| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 245| and significand `zSig', and returns the proper single-precision floating-
 246| point value corresponding to the abstract input.  Ordinarily, the abstract
 247| value is simply rounded and packed into the single-precision format, with
 248| the inexact exception raised if the abstract input cannot be represented
 249| exactly.  However, if the abstract value is too large, the overflow and
 250| inexact exceptions are raised and an infinity or maximal finite value is
 251| returned.  If the abstract value is too small, the input value is rounded to
 252| a subnormal number, and the underflow and inexact exceptions are raised if
 253| the abstract input cannot be represented exactly as a subnormal single-
 254| precision floating-point number.
 255|     The input significand `zSig' has its binary point between bits 30
 256| and 29, which is 7 bits to the left of the usual location.  This shifted
 257| significand must be normalized or smaller.  If `zSig' is not normalized,
 258| `zExp' must be 0; in that case, the result returned is a subnormal number,
 259| and it must not require rounding.  In the usual case that `zSig' is
 260| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 261| The handling of underflow and overflow follows the IEC/IEEE Standard for
 262| Binary Floating-Point Arithmetic.
 263*----------------------------------------------------------------------------*/
 264
 265static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 266{
 267    int8 roundingMode;
 268    flag roundNearestEven;
 269    int8 roundIncrement, roundBits;
 270    flag isTiny;
 271
 272    roundingMode = STATUS(float_rounding_mode);
 273    roundNearestEven = ( roundingMode == float_round_nearest_even );
 274    roundIncrement = 0x40;
 275    if ( ! roundNearestEven ) {
 276        if ( roundingMode == float_round_to_zero ) {
 277            roundIncrement = 0;
 278        }
 279        else {
 280            roundIncrement = 0x7F;
 281            if ( zSign ) {
 282                if ( roundingMode == float_round_up ) roundIncrement = 0;
 283            }
 284            else {
 285                if ( roundingMode == float_round_down ) roundIncrement = 0;
 286            }
 287        }
 288    }
 289    roundBits = zSig & 0x7F;
 290    if ( 0xFD <= (bits16) zExp ) {
 291        if (    ( 0xFD < zExp )
 292             || (    ( zExp == 0xFD )
 293                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 294           ) {
 295            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 296            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 297        }
 298        if ( zExp < 0 ) {
 299            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 300            isTiny =
 301                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 302                || ( zExp < -1 )
 303                || ( zSig + roundIncrement < 0x80000000 );
 304            shift32RightJamming( zSig, - zExp, &zSig );
 305            zExp = 0;
 306            roundBits = zSig & 0x7F;
 307            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 308        }
 309    }
 310    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 311    zSig = ( zSig + roundIncrement )>>7;
 312    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 313    if ( zSig == 0 ) zExp = 0;
 314    return packFloat32( zSign, zExp, zSig );
 315
 316}
 317
 318/*----------------------------------------------------------------------------
 319| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 320| and significand `zSig', and returns the proper single-precision floating-
 321| point value corresponding to the abstract input.  This routine is just like
 322| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 323| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 324| floating-point exponent.
 325*----------------------------------------------------------------------------*/
 326
 327static float32
 328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 329{
 330    int8 shiftCount;
 331
 332    shiftCount = countLeadingZeros32( zSig ) - 1;
 333    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 334
 335}
 336
 337/*----------------------------------------------------------------------------
 338| Returns the fraction bits of the double-precision floating-point value `a'.
 339*----------------------------------------------------------------------------*/
 340
 341INLINE bits64 extractFloat64Frac( float64 a )
 342{
 343
 344    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 345
 346}
 347
 348/*----------------------------------------------------------------------------
 349| Returns the exponent bits of the double-precision floating-point value `a'.
 350*----------------------------------------------------------------------------*/
 351
 352INLINE int16 extractFloat64Exp( float64 a )
 353{
 354
 355    return ( float64_val(a)>>52 ) & 0x7FF;
 356
 357}
 358
 359/*----------------------------------------------------------------------------
 360| Returns the sign bit of the double-precision floating-point value `a'.
 361*----------------------------------------------------------------------------*/
 362
 363INLINE flag extractFloat64Sign( float64 a )
 364{
 365
 366    return float64_val(a)>>63;
 367
 368}
 369
 370/*----------------------------------------------------------------------------
 371| Normalizes the subnormal double-precision floating-point value represented
 372| by the denormalized significand `aSig'.  The normalized exponent and
 373| significand are stored at the locations pointed to by `zExpPtr' and
 374| `zSigPtr', respectively.
 375*----------------------------------------------------------------------------*/
 376
 377static void
 378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 379{
 380    int8 shiftCount;
 381
 382    shiftCount = countLeadingZeros64( aSig ) - 11;
 383    *zSigPtr = aSig<<shiftCount;
 384    *zExpPtr = 1 - shiftCount;
 385
 386}
 387
 388/*----------------------------------------------------------------------------
 389| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 390| double-precision floating-point value, returning the result.  After being
 391| shifted into the proper positions, the three fields are simply added
 392| together to form the result.  This means that any integer portion of `zSig'
 393| will be added into the exponent.  Since a properly normalized significand
 394| will have an integer portion equal to 1, the `zExp' input should be 1 less
 395| than the desired result exponent whenever `zSig' is a complete, normalized
 396| significand.
 397*----------------------------------------------------------------------------*/
 398
 399INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 400{
 401
 402    return make_float64(
 403        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 404
 405}
 406
 407/*----------------------------------------------------------------------------
 408| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 409| and significand `zSig', and returns the proper double-precision floating-
 410| point value corresponding to the abstract input.  Ordinarily, the abstract
 411| value is simply rounded and packed into the double-precision format, with
 412| the inexact exception raised if the abstract input cannot be represented
 413| exactly.  However, if the abstract value is too large, the overflow and
 414| inexact exceptions are raised and an infinity or maximal finite value is
 415| returned.  If the abstract value is too small, the input value is rounded
 416| to a subnormal number, and the underflow and inexact exceptions are raised
 417| if the abstract input cannot be represented exactly as a subnormal double-
 418| precision floating-point number.
 419|     The input significand `zSig' has its binary point between bits 62
 420| and 61, which is 10 bits to the left of the usual location.  This shifted
 421| significand must be normalized or smaller.  If `zSig' is not normalized,
 422| `zExp' must be 0; in that case, the result returned is a subnormal number,
 423| and it must not require rounding.  In the usual case that `zSig' is
 424| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 425| The handling of underflow and overflow follows the IEC/IEEE Standard for
 426| Binary Floating-Point Arithmetic.
 427*----------------------------------------------------------------------------*/
 428
 429static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 430{
 431    int8 roundingMode;
 432    flag roundNearestEven;
 433    int16 roundIncrement, roundBits;
 434    flag isTiny;
 435
 436    roundingMode = STATUS(float_rounding_mode);
 437    roundNearestEven = ( roundingMode == float_round_nearest_even );
 438    roundIncrement = 0x200;
 439    if ( ! roundNearestEven ) {
 440        if ( roundingMode == float_round_to_zero ) {
 441            roundIncrement = 0;
 442        }
 443        else {
 444            roundIncrement = 0x3FF;
 445            if ( zSign ) {
 446                if ( roundingMode == float_round_up ) roundIncrement = 0;
 447            }
 448            else {
 449                if ( roundingMode == float_round_down ) roundIncrement = 0;
 450            }
 451        }
 452    }
 453    roundBits = zSig & 0x3FF;
 454    if ( 0x7FD <= (bits16) zExp ) {
 455        if (    ( 0x7FD < zExp )
 456             || (    ( zExp == 0x7FD )
 457                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 458           ) {
 459            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 460            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 461        }
 462        if ( zExp < 0 ) {
 463            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 464            isTiny =
 465                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 466                || ( zExp < -1 )
 467                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 468            shift64RightJamming( zSig, - zExp, &zSig );
 469            zExp = 0;
 470            roundBits = zSig & 0x3FF;
 471            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 472        }
 473    }
 474    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 475    zSig = ( zSig + roundIncrement )>>10;
 476    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 477    if ( zSig == 0 ) zExp = 0;
 478    return packFloat64( zSign, zExp, zSig );
 479
 480}
 481
 482/*----------------------------------------------------------------------------
 483| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 484| and significand `zSig', and returns the proper double-precision floating-
 485| point value corresponding to the abstract input.  This routine is just like
 486| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 487| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 488| floating-point exponent.
 489*----------------------------------------------------------------------------*/
 490
 491static float64
 492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 493{
 494    int8 shiftCount;
 495
 496    shiftCount = countLeadingZeros64( zSig ) - 1;
 497    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 498
 499}
 500
 501#ifdef FLOATX80
 502
 503/*----------------------------------------------------------------------------
 504| Returns the fraction bits of the extended double-precision floating-point
 505| value `a'.
 506*----------------------------------------------------------------------------*/
 507
 508INLINE bits64 extractFloatx80Frac( floatx80 a )
 509{
 510
 511    return a.low;
 512
 513}
 514
 515/*----------------------------------------------------------------------------
 516| Returns the exponent bits of the extended double-precision floating-point
 517| value `a'.
 518*----------------------------------------------------------------------------*/
 519
 520INLINE int32 extractFloatx80Exp( floatx80 a )
 521{
 522
 523    return a.high & 0x7FFF;
 524
 525}
 526
 527/*----------------------------------------------------------------------------
 528| Returns the sign bit of the extended double-precision floating-point value
 529| `a'.
 530*----------------------------------------------------------------------------*/
 531
 532INLINE flag extractFloatx80Sign( floatx80 a )
 533{
 534
 535    return a.high>>15;
 536
 537}
 538
 539/*----------------------------------------------------------------------------
 540| Normalizes the subnormal extended double-precision floating-point value
 541| represented by the denormalized significand `aSig'.  The normalized exponent
 542| and significand are stored at the locations pointed to by `zExpPtr' and
 543| `zSigPtr', respectively.
 544*----------------------------------------------------------------------------*/
 545
 546static void
 547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 548{
 549    int8 shiftCount;
 550
 551    shiftCount = countLeadingZeros64( aSig );
 552    *zSigPtr = aSig<<shiftCount;
 553    *zExpPtr = 1 - shiftCount;
 554
 555}
 556
 557/*----------------------------------------------------------------------------
 558| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 559| extended double-precision floating-point value, returning the result.
 560*----------------------------------------------------------------------------*/
 561
 562INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 563{
 564    floatx80 z;
 565
 566    z.low = zSig;
 567    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 568    return z;
 569
 570}
 571
 572/*----------------------------------------------------------------------------
 573| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 574| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 575| and returns the proper extended double-precision floating-point value
 576| corresponding to the abstract input.  Ordinarily, the abstract value is
 577| rounded and packed into the extended double-precision format, with the
 578| inexact exception raised if the abstract input cannot be represented
 579| exactly.  However, if the abstract value is too large, the overflow and
 580| inexact exceptions are raised and an infinity or maximal finite value is
 581| returned.  If the abstract value is too small, the input value is rounded to
 582| a subnormal number, and the underflow and inexact exceptions are raised if
 583| the abstract input cannot be represented exactly as a subnormal extended
 584| double-precision floating-point number.
 585|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 586| number of bits as single or double precision, respectively.  Otherwise, the
 587| result is rounded to the full precision of the extended double-precision
 588| format.
 589|     The input significand must be normalized or smaller.  If the input
 590| significand is not normalized, `zExp' must be 0; in that case, the result
 591| returned is a subnormal number, and it must not require rounding.  The
 592| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 593| Floating-Point Arithmetic.
 594*----------------------------------------------------------------------------*/
 595
 596static floatx80
 597 roundAndPackFloatx80(
 598     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 599 STATUS_PARAM)
 600{
 601    int8 roundingMode;
 602    flag roundNearestEven, increment, isTiny;
 603    int64 roundIncrement, roundMask, roundBits;
 604
 605    roundingMode = STATUS(float_rounding_mode);
 606    roundNearestEven = ( roundingMode == float_round_nearest_even );
 607    if ( roundingPrecision == 80 ) goto precision80;
 608    if ( roundingPrecision == 64 ) {
 609        roundIncrement = LIT64( 0x0000000000000400 );
 610        roundMask = LIT64( 0x00000000000007FF );
 611    }
 612    else if ( roundingPrecision == 32 ) {
 613        roundIncrement = LIT64( 0x0000008000000000 );
 614        roundMask = LIT64( 0x000000FFFFFFFFFF );
 615    }
 616    else {
 617        goto precision80;
 618    }
 619    zSig0 |= ( zSig1 != 0 );
 620    if ( ! roundNearestEven ) {
 621        if ( roundingMode == float_round_to_zero ) {
 622            roundIncrement = 0;
 623        }
 624        else {
 625            roundIncrement = roundMask;
 626            if ( zSign ) {
 627                if ( roundingMode == float_round_up ) roundIncrement = 0;
 628            }
 629            else {
 630                if ( roundingMode == float_round_down ) roundIncrement = 0;
 631            }
 632        }
 633    }
 634    roundBits = zSig0 & roundMask;
 635    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 636        if (    ( 0x7FFE < zExp )
 637             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 638           ) {
 639            goto overflow;
 640        }
 641        if ( zExp <= 0 ) {
 642            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 643            isTiny =
 644                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 645                || ( zExp < 0 )
 646                || ( zSig0 <= zSig0 + roundIncrement );
 647            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 648            zExp = 0;
 649            roundBits = zSig0 & roundMask;
 650            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 651            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 652            zSig0 += roundIncrement;
 653            if ( (sbits64) zSig0 < 0 ) zExp = 1;
 654            roundIncrement = roundMask + 1;
 655            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 656                roundMask |= roundIncrement;
 657            }
 658            zSig0 &= ~ roundMask;
 659            return packFloatx80( zSign, zExp, zSig0 );
 660        }
 661    }
 662    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 663    zSig0 += roundIncrement;
 664    if ( zSig0 < roundIncrement ) {
 665        ++zExp;
 666        zSig0 = LIT64( 0x8000000000000000 );
 667    }
 668    roundIncrement = roundMask + 1;
 669    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 670        roundMask |= roundIncrement;
 671    }
 672    zSig0 &= ~ roundMask;
 673    if ( zSig0 == 0 ) zExp = 0;
 674    return packFloatx80( zSign, zExp, zSig0 );
 675 precision80:
 676    increment = ( (sbits64) zSig1 < 0 );
 677    if ( ! roundNearestEven ) {
 678        if ( roundingMode == float_round_to_zero ) {
 679            increment = 0;
 680        }
 681        else {
 682            if ( zSign ) {
 683                increment = ( roundingMode == float_round_down ) && zSig1;
 684            }
 685            else {
 686                increment = ( roundingMode == float_round_up ) && zSig1;
 687            }
 688        }
 689    }
 690    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691        if (    ( 0x7FFE < zExp )
 692             || (    ( zExp == 0x7FFE )
 693                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 694                  && increment
 695                )
 696           ) {
 697            roundMask = 0;
 698 overflow:
 699            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 700            if (    ( roundingMode == float_round_to_zero )
 701                 || ( zSign && ( roundingMode == float_round_up ) )
 702                 || ( ! zSign && ( roundingMode == float_round_down ) )
 703               ) {
 704                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 705            }
 706            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 707        }
 708        if ( zExp <= 0 ) {
 709            isTiny =
 710                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 711                || ( zExp < 0 )
 712                || ! increment
 713                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 714            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 715            zExp = 0;
 716            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 717            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 718            if ( roundNearestEven ) {
 719                increment = ( (sbits64) zSig1 < 0 );
 720            }
 721            else {
 722                if ( zSign ) {
 723                    increment = ( roundingMode == float_round_down ) && zSig1;
 724                }
 725                else {
 726                    increment = ( roundingMode == float_round_up ) && zSig1;
 727                }
 728            }
 729            if ( increment ) {
 730                ++zSig0;
 731                zSig0 &=
 732                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 733                if ( (sbits64) zSig0 < 0 ) zExp = 1;
 734            }
 735            return packFloatx80( zSign, zExp, zSig0 );
 736        }
 737    }
 738    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 739    if ( increment ) {
 740        ++zSig0;
 741        if ( zSig0 == 0 ) {
 742            ++zExp;
 743            zSig0 = LIT64( 0x8000000000000000 );
 744        }
 745        else {
 746            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 747        }
 748    }
 749    else {
 750        if ( zSig0 == 0 ) zExp = 0;
 751    }
 752    return packFloatx80( zSign, zExp, zSig0 );
 753
 754}
 755
 756/*----------------------------------------------------------------------------
 757| Takes an abstract floating-point value having sign `zSign', exponent
 758| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 759| and returns the proper extended double-precision floating-point value
 760| corresponding to the abstract input.  This routine is just like
 761| `roundAndPackFloatx80' except that the input significand does not have to be
 762| normalized.
 763*----------------------------------------------------------------------------*/
 764
 765static floatx80
 766 normalizeRoundAndPackFloatx80(
 767     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 768 STATUS_PARAM)
 769{
 770    int8 shiftCount;
 771
 772    if ( zSig0 == 0 ) {
 773        zSig0 = zSig1;
 774        zSig1 = 0;
 775        zExp -= 64;
 776    }
 777    shiftCount = countLeadingZeros64( zSig0 );
 778    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 779    zExp -= shiftCount;
 780    return
 781        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 782
 783}
 784
 785#endif
 786
 787#ifdef FLOAT128
 788
 789/*----------------------------------------------------------------------------
 790| Returns the least-significant 64 fraction bits of the quadruple-precision
 791| floating-point value `a'.
 792*----------------------------------------------------------------------------*/
 793
 794INLINE bits64 extractFloat128Frac1( float128 a )
 795{
 796
 797    return a.low;
 798
 799}
 800
 801/*----------------------------------------------------------------------------
 802| Returns the most-significant 48 fraction bits of the quadruple-precision
 803| floating-point value `a'.
 804*----------------------------------------------------------------------------*/
 805
 806INLINE bits64 extractFloat128Frac0( float128 a )
 807{
 808
 809    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 810
 811}
 812
 813/*----------------------------------------------------------------------------
 814| Returns the exponent bits of the quadruple-precision floating-point value
 815| `a'.
 816*----------------------------------------------------------------------------*/
 817
 818INLINE int32 extractFloat128Exp( float128 a )
 819{
 820
 821    return ( a.high>>48 ) & 0x7FFF;
 822
 823}
 824
 825/*----------------------------------------------------------------------------
 826| Returns the sign bit of the quadruple-precision floating-point value `a'.
 827*----------------------------------------------------------------------------*/
 828
 829INLINE flag extractFloat128Sign( float128 a )
 830{
 831
 832    return a.high>>63;
 833
 834}
 835
 836/*----------------------------------------------------------------------------
 837| Normalizes the subnormal quadruple-precision floating-point value
 838| represented by the denormalized significand formed by the concatenation of
 839| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 840| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 841| significand are stored at the location pointed to by `zSig0Ptr', and the
 842| least significant 64 bits of the normalized significand are stored at the
 843| location pointed to by `zSig1Ptr'.
 844*----------------------------------------------------------------------------*/
 845
 846static void
 847 normalizeFloat128Subnormal(
 848     bits64 aSig0,
 849     bits64 aSig1,
 850     int32 *zExpPtr,
 851     bits64 *zSig0Ptr,
 852     bits64 *zSig1Ptr
 853 )
 854{
 855    int8 shiftCount;
 856
 857    if ( aSig0 == 0 ) {
 858        shiftCount = countLeadingZeros64( aSig1 ) - 15;
 859        if ( shiftCount < 0 ) {
 860            *zSig0Ptr = aSig1>>( - shiftCount );
 861            *zSig1Ptr = aSig1<<( shiftCount & 63 );
 862        }
 863        else {
 864            *zSig0Ptr = aSig1<<shiftCount;
 865            *zSig1Ptr = 0;
 866        }
 867        *zExpPtr = - shiftCount - 63;
 868    }
 869    else {
 870        shiftCount = countLeadingZeros64( aSig0 ) - 15;
 871        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 872        *zExpPtr = 1 - shiftCount;
 873    }
 874
 875}
 876
 877/*----------------------------------------------------------------------------
 878| Packs the sign `zSign', the exponent `zExp', and the significand formed
 879| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 880| floating-point value, returning the result.  After being shifted into the
 881| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 882| added together to form the most significant 32 bits of the result.  This
 883| means that any integer portion of `zSig0' will be added into the exponent.
 884| Since a properly normalized significand will have an integer portion equal
 885| to 1, the `zExp' input should be 1 less than the desired result exponent
 886| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 887| significand.
 888*----------------------------------------------------------------------------*/
 889
 890INLINE float128
 891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 892{
 893    float128 z;
 894
 895    z.low = zSig1;
 896    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 897    return z;
 898
 899}
 900
 901/*----------------------------------------------------------------------------
 902| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 903| and extended significand formed by the concatenation of `zSig0', `zSig1',
 904| and `zSig2', and returns the proper quadruple-precision floating-point value
 905| corresponding to the abstract input.  Ordinarily, the abstract value is
 906| simply rounded and packed into the quadruple-precision format, with the
 907| inexact exception raised if the abstract input cannot be represented
 908| exactly.  However, if the abstract value is too large, the overflow and
 909| inexact exceptions are raised and an infinity or maximal finite value is
 910| returned.  If the abstract value is too small, the input value is rounded to
 911| a subnormal number, and the underflow and inexact exceptions are raised if
 912| the abstract input cannot be represented exactly as a subnormal quadruple-
 913| precision floating-point number.
 914|     The input significand must be normalized or smaller.  If the input
 915| significand is not normalized, `zExp' must be 0; in that case, the result
 916| returned is a subnormal number, and it must not require rounding.  In the
 917| usual case that the input significand is normalized, `zExp' must be 1 less
 918| than the ``true'' floating-point exponent.  The handling of underflow and
 919| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 920*----------------------------------------------------------------------------*/
 921
 922static float128
 923 roundAndPackFloat128(
 924     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 925{
 926    int8 roundingMode;
 927    flag roundNearestEven, increment, isTiny;
 928
 929    roundingMode = STATUS(float_rounding_mode);
 930    roundNearestEven = ( roundingMode == float_round_nearest_even );
 931    increment = ( (sbits64) zSig2 < 0 );
 932    if ( ! roundNearestEven ) {
 933        if ( roundingMode == float_round_to_zero ) {
 934            increment = 0;
 935        }
 936        else {
 937            if ( zSign ) {
 938                increment = ( roundingMode == float_round_down ) && zSig2;
 939            }
 940            else {
 941                increment = ( roundingMode == float_round_up ) && zSig2;
 942            }
 943        }
 944    }
 945    if ( 0x7FFD <= (bits32) zExp ) {
 946        if (    ( 0x7FFD < zExp )
 947             || (    ( zExp == 0x7FFD )
 948                  && eq128(
 949                         LIT64( 0x0001FFFFFFFFFFFF ),
 950                         LIT64( 0xFFFFFFFFFFFFFFFF ),
 951                         zSig0,
 952                         zSig1
 953                     )
 954                  && increment
 955                )
 956           ) {
 957            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 958            if (    ( roundingMode == float_round_to_zero )
 959                 || ( zSign && ( roundingMode == float_round_up ) )
 960                 || ( ! zSign && ( roundingMode == float_round_down ) )
 961               ) {
 962                return
 963                    packFloat128(
 964                        zSign,
 965                        0x7FFE,
 966                        LIT64( 0x0000FFFFFFFFFFFF ),
 967                        LIT64( 0xFFFFFFFFFFFFFFFF )
 968                    );
 969            }
 970            return packFloat128( zSign, 0x7FFF, 0, 0 );
 971        }
 972        if ( zExp < 0 ) {
 973            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
 974            isTiny =
 975                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 976                || ( zExp < -1 )
 977                || ! increment
 978                || lt128(
 979                       zSig0,
 980                       zSig1,
 981                       LIT64( 0x0001FFFFFFFFFFFF ),
 982                       LIT64( 0xFFFFFFFFFFFFFFFF )
 983                   );
 984            shift128ExtraRightJamming(
 985                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 986            zExp = 0;
 987            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 988            if ( roundNearestEven ) {
 989                increment = ( (sbits64) zSig2 < 0 );
 990            }
 991            else {
 992                if ( zSign ) {
 993                    increment = ( roundingMode == float_round_down ) && zSig2;
 994                }
 995                else {
 996                    increment = ( roundingMode == float_round_up ) && zSig2;
 997                }
 998            }
 999        }
1000    }
1001    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002    if ( increment ) {
1003        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005    }
1006    else {
1007        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008    }
1009    return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011}
1012
1013/*----------------------------------------------------------------------------
1014| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016| returns the proper quadruple-precision floating-point value corresponding
1017| to the abstract input.  This routine is just like `roundAndPackFloat128'
1018| except that the input significand has fewer bits and does not have to be
1019| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020| point exponent.
1021*----------------------------------------------------------------------------*/
1022
1023static float128
1024 normalizeRoundAndPackFloat128(
1025     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026{
1027    int8 shiftCount;
1028    bits64 zSig2;
1029
1030    if ( zSig0 == 0 ) {
1031        zSig0 = zSig1;
1032        zSig1 = 0;
1033        zExp -= 64;
1034    }
1035    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036    if ( 0 <= shiftCount ) {
1037        zSig2 = 0;
1038        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039    }
1040    else {
1041        shift128ExtraRightJamming(
1042            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043    }
1044    zExp -= shiftCount;
1045    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047}
1048
1049#endif
1050
1051/*----------------------------------------------------------------------------
1052| Returns the result of converting the 32-bit two's complement integer `a'
1053| to the single-precision floating-point format.  The conversion is performed
1054| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055*----------------------------------------------------------------------------*/
1056
1057float32 int32_to_float32( int32 a STATUS_PARAM )
1058{
1059    flag zSign;
1060
1061    if ( a == 0 ) return float32_zero;
1062    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063    zSign = ( a < 0 );
1064    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066}
1067
1068/*----------------------------------------------------------------------------
1069| Returns the result of converting the 32-bit two's complement integer `a'
1070| to the double-precision floating-point format.  The conversion is performed
1071| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072*----------------------------------------------------------------------------*/
1073
1074float64 int32_to_float64( int32 a STATUS_PARAM )
1075{
1076    flag zSign;
1077    uint32 absA;
1078    int8 shiftCount;
1079    bits64 zSig;
1080
1081    if ( a == 0 ) return float64_zero;
1082    zSign = ( a < 0 );
1083    absA = zSign ? - a : a;
1084    shiftCount = countLeadingZeros32( absA ) + 21;
1085    zSig = absA;
1086    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088}
1089
1090#ifdef FLOATX80
1091
1092/*----------------------------------------------------------------------------
1093| Returns the result of converting the 32-bit two's complement integer `a'
1094| to the extended double-precision floating-point format.  The conversion
1095| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096| Arithmetic.
1097*----------------------------------------------------------------------------*/
1098
1099floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100{
1101    flag zSign;
1102    uint32 absA;
1103    int8 shiftCount;
1104    bits64 zSig;
1105
1106    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107    zSign = ( a < 0 );
1108    absA = zSign ? - a : a;
1109    shiftCount = countLeadingZeros32( absA ) + 32;
1110    zSig = absA;
1111    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113}
1114
1115#endif
1116
1117#ifdef FLOAT128
1118
1119/*----------------------------------------------------------------------------
1120| Returns the result of converting the 32-bit two's complement integer `a' to
1121| the quadruple-precision floating-point format.  The conversion is performed
1122| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123*----------------------------------------------------------------------------*/
1124
1125float128 int32_to_float128( int32 a STATUS_PARAM )
1126{
1127    flag zSign;
1128    uint32 absA;
1129    int8 shiftCount;
1130    bits64 zSig0;
1131
1132    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133    zSign = ( a < 0 );
1134    absA = zSign ? - a : a;
1135    shiftCount = countLeadingZeros32( absA ) + 17;
1136    zSig0 = absA;
1137    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139}
1140
1141#endif
1142
1143/*----------------------------------------------------------------------------
1144| Returns the result of converting the 64-bit two's complement integer `a'
1145| to the single-precision floating-point format.  The conversion is performed
1146| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147*----------------------------------------------------------------------------*/
1148
1149float32 int64_to_float32( int64 a STATUS_PARAM )
1150{
1151    flag zSign;
1152    uint64 absA;
1153    int8 shiftCount;
1154
1155    if ( a == 0 ) return float32_zero;
1156    zSign = ( a < 0 );
1157    absA = zSign ? - a : a;
1158    shiftCount = countLeadingZeros64( absA ) - 40;
1159    if ( 0 <= shiftCount ) {
1160        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161    }
1162    else {
1163        shiftCount += 7;
1164        if ( shiftCount < 0 ) {
1165            shift64RightJamming( absA, - shiftCount, &absA );
1166        }
1167        else {
1168            absA <<= shiftCount;
1169        }
1170        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171    }
1172
1173}
1174
1175float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176{
1177    int8 shiftCount;
1178
1179    if ( a == 0 ) return float32_zero;
1180    shiftCount = countLeadingZeros64( a ) - 40;
1181    if ( 0 <= shiftCount ) {
1182        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183    }
1184    else {
1185        shiftCount += 7;
1186        if ( shiftCount < 0 ) {
1187            shift64RightJamming( a, - shiftCount, &a );
1188        }
1189        else {
1190            a <<= shiftCount;
1191        }
1192        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193    }
1194}
1195
1196/*----------------------------------------------------------------------------
1197| Returns the result of converting the 64-bit two's complement integer `a'
1198| to the double-precision floating-point format.  The conversion is performed
1199| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200*----------------------------------------------------------------------------*/
1201
1202float64 int64_to_float64( int64 a STATUS_PARAM )
1203{
1204    flag zSign;
1205
1206    if ( a == 0 ) return float64_zero;
1207    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208        return packFloat64( 1, 0x43E, 0 );
1209    }
1210    zSign = ( a < 0 );
1211    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213}
1214
1215float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216{
1217    if ( a == 0 ) return float64_zero;
1218    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220}
1221
1222#ifdef FLOATX80
1223
1224/*----------------------------------------------------------------------------
1225| Returns the result of converting the 64-bit two's complement integer `a'
1226| to the extended double-precision floating-point format.  The conversion
1227| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228| Arithmetic.
1229*----------------------------------------------------------------------------*/
1230
1231floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232{
1233    flag zSign;
1234    uint64 absA;
1235    int8 shiftCount;
1236
1237    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238    zSign = ( a < 0 );
1239    absA = zSign ? - a : a;
1240    shiftCount = countLeadingZeros64( absA );
1241    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243}
1244
1245#endif
1246
1247#ifdef FLOAT128
1248
1249/*----------------------------------------------------------------------------
1250| Returns the result of converting the 64-bit two's complement integer `a' to
1251| the quadruple-precision floating-point format.  The conversion is performed
1252| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253*----------------------------------------------------------------------------*/
1254
1255float128 int64_to_float128( int64 a STATUS_PARAM )
1256{
1257    flag zSign;
1258    uint64 absA;
1259    int8 shiftCount;
1260    int32 zExp;
1261    bits64 zSig0, zSig1;
1262
1263    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264    zSign = ( a < 0 );
1265    absA = zSign ? - a : a;
1266    shiftCount = countLeadingZeros64( absA ) + 49;
1267    zExp = 0x406E - shiftCount;
1268    if ( 64 <= shiftCount ) {
1269        zSig1 = 0;
1270        zSig0 = absA;
1271        shiftCount -= 64;
1272    }
1273    else {
1274        zSig1 = absA;
1275        zSig0 = 0;
1276    }
1277    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278    return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280}
1281
1282#endif
1283
1284/*----------------------------------------------------------------------------
1285| Returns the result of converting the single-precision floating-point value
1286| `a' to the 32-bit two's complement integer format.  The conversion is
1287| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288| Arithmetic---which means in particular that the conversion is rounded
1289| according to the current rounding mode.  If `a' is a NaN, the largest
1290| positive integer is returned.  Otherwise, if the conversion overflows, the
1291| largest integer with the same sign as `a' is returned.
1292*----------------------------------------------------------------------------*/
1293
1294int32 float32_to_int32( float32 a STATUS_PARAM )
1295{
1296    flag aSign;
1297    int16 aExp, shiftCount;
1298    bits32 aSig;
1299    bits64 aSig64;
1300
1301    aSig = extractFloat32Frac( a );
1302    aExp = extractFloat32Exp( a );
1303    aSign = extractFloat32Sign( a );
1304    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305    if ( aExp ) aSig |= 0x00800000;
1306    shiftCount = 0xAF - aExp;
1307    aSig64 = aSig;
1308    aSig64 <<= 32;
1309    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312}
1313
1314/*----------------------------------------------------------------------------
1315| Returns the result of converting the single-precision floating-point value
1316| `a' to the 32-bit two's complement integer format.  The conversion is
1317| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318| Arithmetic, except that the conversion is always rounded toward zero.
1319| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320| the conversion overflows, the largest integer with the same sign as `a' is
1321| returned.
1322*----------------------------------------------------------------------------*/
1323
1324int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325{
1326    flag aSign;
1327    int16 aExp, shiftCount;
1328    bits32 aSig;
1329    int32 z;
1330
1331    aSig = extractFloat32Frac( a );
1332    aExp = extractFloat32Exp( a );
1333    aSign = extractFloat32Sign( a );
1334    shiftCount = aExp - 0x9E;
1335    if ( 0 <= shiftCount ) {
1336        if ( float32_val(a) != 0xCF000000 ) {
1337            float_raise( float_flag_invalid STATUS_VAR);
1338            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339        }
1340        return (sbits32) 0x80000000;
1341    }
1342    else if ( aExp <= 0x7E ) {
1343        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344        return 0;
1345    }
1346    aSig = ( aSig | 0x00800000 )<<8;
1347    z = aSig>>( - shiftCount );
1348    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349        STATUS(float_exception_flags) |= float_flag_inexact;
1350    }
1351    if ( aSign ) z = - z;
1352    return z;
1353
1354}
1355
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the single-precision floating-point value
1358| `a' to the 64-bit two's complement integer format.  The conversion is
1359| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360| Arithmetic---which means in particular that the conversion is rounded
1361| according to the current rounding mode.  If `a' is a NaN, the largest
1362| positive integer is returned.  Otherwise, if the conversion overflows, the
1363| largest integer with the same sign as `a' is returned.
1364*----------------------------------------------------------------------------*/
1365
1366int64 float32_to_int64( float32 a STATUS_PARAM )
1367{
1368    flag aSign;
1369    int16 aExp, shiftCount;
1370    bits32 aSig;
1371    bits64 aSig64, aSigExtra;
1372
1373    aSig = extractFloat32Frac( a );
1374    aExp = extractFloat32Exp( a );
1375    aSign = extractFloat32Sign( a );
1376    shiftCount = 0xBE - aExp;
1377    if ( shiftCount < 0 ) {
1378        float_raise( float_flag_invalid STATUS_VAR);
1379        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380            return LIT64( 0x7FFFFFFFFFFFFFFF );
1381        }
1382        return (sbits64) LIT64( 0x8000000000000000 );
1383    }
1384    if ( aExp ) aSig |= 0x00800000;
1385    aSig64 = aSig;
1386    aSig64 <<= 40;
1387    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390}
1391
1392/*----------------------------------------------------------------------------
1393| Returns the result of converting the single-precision floating-point value
1394| `a' to the 64-bit two's complement integer format.  The conversion is
1395| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396| Arithmetic, except that the conversion is always rounded toward zero.  If
1397| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398| conversion overflows, the largest integer with the same sign as `a' is
1399| returned.
1400*----------------------------------------------------------------------------*/
1401
1402int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403{
1404    flag aSign;
1405    int16 aExp, shiftCount;
1406    bits32 aSig;
1407    bits64 aSig64;
1408    int64 z;
1409
1410    aSig = extractFloat32Frac( a );
1411    aExp = extractFloat32Exp( a );
1412    aSign = extractFloat32Sign( a );
1413    shiftCount = aExp - 0xBE;
1414    if ( 0 <= shiftCount ) {
1415        if ( float32_val(a) != 0xDF000000 ) {
1416            float_raise( float_flag_invalid STATUS_VAR);
1417            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418                return LIT64( 0x7FFFFFFFFFFFFFFF );
1419            }
1420        }
1421        return (sbits64) LIT64( 0x8000000000000000 );
1422    }
1423    else if ( aExp <= 0x7E ) {
1424        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425        return 0;
1426    }
1427    aSig64 = aSig | 0x00800000;
1428    aSig64 <<= 40;
1429    z = aSig64>>( - shiftCount );
1430    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431        STATUS(float_exception_flags) |= float_flag_inexact;
1432    }
1433    if ( aSign ) z = - z;
1434    return z;
1435
1436}
1437
1438/*----------------------------------------------------------------------------
1439| Returns the result of converting the single-precision floating-point value
1440| `a' to the double-precision floating-point format.  The conversion is
1441| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442| Arithmetic.
1443*----------------------------------------------------------------------------*/
1444
1445float64 float32_to_float64( float32 a STATUS_PARAM )
1446{
1447    flag aSign;
1448    int16 aExp;
1449    bits32 aSig;
1450
1451    aSig = extractFloat32Frac( a );
1452    aExp = extractFloat32Exp( a );
1453    aSign = extractFloat32Sign( a );
1454    if ( aExp == 0xFF ) {
1455        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456        return packFloat64( aSign, 0x7FF, 0 );
1457    }
1458    if ( aExp == 0 ) {
1459        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461        --aExp;
1462    }
1463    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465}
1466
1467#ifdef FLOATX80
1468
1469/*----------------------------------------------------------------------------
1470| Returns the result of converting the single-precision floating-point value
1471| `a' to the extended double-precision floating-point format.  The conversion
1472| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473| Arithmetic.
1474*----------------------------------------------------------------------------*/
1475
1476floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477{
1478    flag aSign;
1479    int16 aExp;
1480    bits32 aSig;
1481
1482    aSig = extractFloat32Frac( a );
1483    aExp = extractFloat32Exp( a );
1484    aSign = extractFloat32Sign( a );
1485    if ( aExp == 0xFF ) {
1486        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488    }
1489    if ( aExp == 0 ) {
1490        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492    }
1493    aSig |= 0x00800000;
1494    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496}
1497
1498#endif
1499
1500#ifdef FLOAT128
1501
1502/*----------------------------------------------------------------------------
1503| Returns the result of converting the single-precision floating-point value
1504| `a' to the double-precision floating-point format.  The conversion is
1505| performed according to the IEC/IEEE Standard for Binary Floating-Point
1506| Arithmetic.
1507*----------------------------------------------------------------------------*/
1508
1509float128 float32_to_float128( float32 a STATUS_PARAM )
1510{
1511    flag aSign;
1512    int16 aExp;
1513    bits32 aSig;
1514
1515    aSig = extractFloat32Frac( a );
1516    aExp = extractFloat32Exp( a );
1517    aSign = extractFloat32Sign( a );
1518    if ( aExp == 0xFF ) {
1519        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520        return packFloat128( aSign, 0x7FFF, 0, 0 );
1521    }
1522    if ( aExp == 0 ) {
1523        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525        --aExp;
1526    }
1527    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529}
1530
1531#endif
1532
1533/*----------------------------------------------------------------------------
1534| Rounds the single-precision floating-point value `a' to an integer, and
1535| returns the result as a single-precision floating-point value.  The
1536| operation is performed according to the IEC/IEEE Standard for Binary
1537| Floating-Point Arithmetic.
1538*----------------------------------------------------------------------------*/
1539
1540float32 float32_round_to_int( float32 a STATUS_PARAM)
1541{
1542    flag aSign;
1543    int16 aExp;
1544    bits32 lastBitMask, roundBitsMask;
1545    int8 roundingMode;
1546    bits32 z;
1547
1548    aExp = extractFloat32Exp( a );
1549    if ( 0x96 <= aExp ) {
1550        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551            return propagateFloat32NaN( a, a STATUS_VAR );
1552        }
1553        return a;
1554    }
1555    if ( aExp <= 0x7E ) {
1556        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557        STATUS(float_exception_flags) |= float_flag_inexact;
1558        aSign = extractFloat32Sign( a );
1559        switch ( STATUS(float_rounding_mode) ) {
1560         case float_round_nearest_even:
1561            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562                return packFloat32( aSign, 0x7F, 0 );
1563            }
1564            break;
1565         case float_round_down:
1566            return make_float32(aSign ? 0xBF800000 : 0);
1567         case float_round_up:
1568            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569        }
1570        return packFloat32( aSign, 0, 0 );
1571    }
1572    lastBitMask = 1;
1573    lastBitMask <<= 0x96 - aExp;
1574    roundBitsMask = lastBitMask - 1;
1575    z = float32_val(a);
1576    roundingMode = STATUS(float_rounding_mode);
1577    if ( roundingMode == float_round_nearest_even ) {
1578        z += lastBitMask>>1;
1579        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580    }
1581    else if ( roundingMode != float_round_to_zero ) {
1582        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583            z += roundBitsMask;
1584        }
1585    }
1586    z &= ~ roundBitsMask;
1587    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588    return make_float32(z);
1589
1590}
1591
1592/*----------------------------------------------------------------------------
1593| Returns the result of adding the absolute values of the single-precision
1594| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595| before being returned.  `zSign' is ignored if the result is a NaN.
1596| The addition is performed according to the IEC/IEEE Standard for Binary
1597| Floating-Point Arithmetic.
1598*----------------------------------------------------------------------------*/
1599
1600static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601{
1602    int16 aExp, bExp, zExp;
1603    bits32 aSig, bSig, zSig;
1604    int16 expDiff;
1605
1606    aSig = extractFloat32Frac( a );
1607    aExp = extractFloat32Exp( a );
1608    bSig = extractFloat32Frac( b );
1609    bExp = extractFloat32Exp( b );
1610    expDiff = aExp - bExp;
1611    aSig <<= 6;
1612    bSig <<= 6;
1613    if ( 0 < expDiff ) {
1614        if ( aExp == 0xFF ) {
1615            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616            return a;
1617        }
1618        if ( bExp == 0 ) {
1619            --expDiff;
1620        }
1621        else {
1622            bSig |= 0x20000000;
1623        }
1624        shift32RightJamming( bSig, expDiff, &bSig );
1625        zExp = aExp;
1626    }
1627    else if ( expDiff < 0 ) {
1628        if ( bExp == 0xFF ) {
1629            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630            return packFloat32( zSign, 0xFF, 0 );
1631        }
1632        if ( aExp == 0 ) {
1633            ++expDiff;
1634        }
1635        else {
1636            aSig |= 0x20000000;
1637        }
1638        shift32RightJamming( aSig, - expDiff, &aSig );
1639        zExp = bExp;
1640    }
1641    else {
1642        if ( aExp == 0xFF ) {
1643            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644            return a;
1645        }
1646        if ( aExp == 0 ) {
1647            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649        }
1650        zSig = 0x40000000 + aSig + bSig;
1651        zExp = aExp;
1652        goto roundAndPack;
1653    }
1654    aSig |= 0x20000000;
1655    zSig = ( aSig + bSig )<<1;
1656    --zExp;
1657    if ( (sbits32) zSig < 0 ) {
1658        zSig = aSig + bSig;
1659        ++zExp;
1660    }
1661 roundAndPack:
1662    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664}
1665
1666/*----------------------------------------------------------------------------
1667| Returns the result of subtracting the absolute values of the single-
1668| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669| difference is negated before being returned.  `zSign' is ignored if the
1670| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671| Standard for Binary Floating-Point Arithmetic.
1672*----------------------------------------------------------------------------*/
1673
1674static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675{
1676    int16 aExp, bExp, zExp;
1677    bits32 aSig, bSig, zSig;
1678    int16 expDiff;
1679
1680    aSig = extractFloat32Frac( a );
1681    aExp = extractFloat32Exp( a );
1682    bSig = extractFloat32Frac( b );
1683    bExp = extractFloat32Exp( b );
1684    expDiff = aExp - bExp;
1685    aSig <<= 7;
1686    bSig <<= 7;
1687    if ( 0 < expDiff ) goto aExpBigger;
1688    if ( expDiff < 0 ) goto bExpBigger;
1689    if ( aExp == 0xFF ) {
1690        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691        float_raise( float_flag_invalid STATUS_VAR);
1692        return float32_default_nan;
1693    }
1694    if ( aExp == 0 ) {
1695        aExp = 1;
1696        bExp = 1;
1697    }
1698    if ( bSig < aSig ) goto aBigger;
1699    if ( aSig < bSig ) goto bBigger;
1700    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701 bExpBigger:
1702    if ( bExp == 0xFF ) {
1703        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704        return packFloat32( zSign ^ 1, 0xFF, 0 );
1705    }
1706    if ( aExp == 0 ) {
1707        ++expDiff;
1708    }
1709    else {
1710        aSig |= 0x40000000;
1711    }
1712    shift32RightJamming( aSig, - expDiff, &aSig );
1713    bSig |= 0x40000000;
1714 bBigger:
1715    zSig = bSig - aSig;
1716    zExp = bExp;
1717    zSign ^= 1;
1718    goto normalizeRoundAndPack;
1719 aExpBigger:
1720    if ( aExp == 0xFF ) {
1721        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722        return a;
1723    }
1724    if ( bExp == 0 ) {
1725        --expDiff;
1726    }
1727    else {
1728        bSig |= 0x40000000;
1729    }
1730    shift32RightJamming( bSig, expDiff, &bSig );
1731    aSig |= 0x40000000;
1732 aBigger:
1733    zSig = aSig - bSig;
1734    zExp = aExp;
1735 normalizeRoundAndPack:
1736    --zExp;
1737    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739}
1740
1741/*----------------------------------------------------------------------------
1742| Returns the result of adding the single-precision floating-point values `a'
1743| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744| Binary Floating-Point Arithmetic.
1745*----------------------------------------------------------------------------*/
1746
1747float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748{
1749    flag aSign, bSign;
1750
1751    aSign = extractFloat32Sign( a );
1752    bSign = extractFloat32Sign( b );
1753    if ( aSign == bSign ) {
1754        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755    }
1756    else {
1757        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758    }
1759
1760}
1761
1762/*----------------------------------------------------------------------------
1763| Returns the result of subtracting the single-precision floating-point values
1764| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765| for Binary Floating-Point Arithmetic.
1766*----------------------------------------------------------------------------*/
1767
1768float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769{
1770    flag aSign, bSign;
1771
1772    aSign = extractFloat32Sign( a );
1773    bSign = extractFloat32Sign( b );
1774    if ( aSign == bSign ) {
1775        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776    }
1777    else {
1778        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779    }
1780
1781}
1782
1783/*----------------------------------------------------------------------------
1784| Returns the result of multiplying the single-precision floating-point values
1785| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786| for Binary Floating-Point Arithmetic.
1787*----------------------------------------------------------------------------*/
1788
1789float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790{
1791    flag aSign, bSign, zSign;
1792    int16 aExp, bExp, zExp;
1793    bits32 aSig, bSig;
1794    bits64 zSig64;
1795    bits32 zSig;
1796
1797    aSig = extractFloat32Frac( a );
1798    aExp = extractFloat32Exp( a );
1799    aSign = extractFloat32Sign( a );
1800    bSig = extractFloat32Frac( b );
1801    bExp = extractFloat32Exp( b );
1802    bSign = extractFloat32Sign( b );
1803    zSign = aSign ^ bSign;
1804    if ( aExp == 0xFF ) {
1805        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806            return propagateFloat32NaN( a, b STATUS_VAR );
1807        }
1808        if ( ( bExp | bSig ) == 0 ) {
1809            float_raise( float_flag_invalid STATUS_VAR);
1810            return float32_default_nan;
1811        }
1812        return packFloat32( zSign, 0xFF, 0 );
1813    }
1814    if ( bExp == 0xFF ) {
1815        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816        if ( ( aExp | aSig ) == 0 ) {
1817            float_raise( float_flag_invalid STATUS_VAR);
1818            return float32_default_nan;
1819        }
1820        return packFloat32( zSign, 0xFF, 0 );
1821    }
1822    if ( aExp == 0 ) {
1823        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825    }
1826    if ( bExp == 0 ) {
1827        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829    }
1830    zExp = aExp + bExp - 0x7F;
1831    aSig = ( aSig | 0x00800000 )<<7;
1832    bSig = ( bSig | 0x00800000 )<<8;
1833    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834    zSig = zSig64;
1835    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836        zSig <<= 1;
1837        --zExp;
1838    }
1839    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841}
1842
1843/*----------------------------------------------------------------------------
1844| Returns the result of dividing the single-precision floating-point value `a'
1845| by the corresponding value `b'.  The operation is performed according to the
1846| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847*----------------------------------------------------------------------------*/
1848
1849float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850{
1851    flag aSign, bSign, zSign;
1852    int16 aExp, bExp, zExp;
1853    bits32 aSig, bSig, zSig;
1854
1855    aSig = extractFloat32Frac( a );
1856    aExp = extractFloat32Exp( a );
1857    aSign = extractFloat32Sign( a );
1858    bSig = extractFloat32Frac( b );
1859    bExp = extractFloat32Exp( b );
1860    bSign = extractFloat32Sign( b );
1861    zSign = aSign ^ bSign;
1862    if ( aExp == 0xFF ) {
1863        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864        if ( bExp == 0xFF ) {
1865            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866            float_raise( float_flag_invalid STATUS_VAR);
1867            return float32_default_nan;
1868        }
1869        return packFloat32( zSign, 0xFF, 0 );
1870    }
1871    if ( bExp == 0xFF ) {
1872        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873        return packFloat32( zSign, 0, 0 );
1874    }
1875    if ( bExp == 0 ) {
1876        if ( bSig == 0 ) {
1877            if ( ( aExp | aSig ) == 0 ) {
1878                float_raise( float_flag_invalid STATUS_VAR);
1879                return float32_default_nan;
1880            }
1881            float_raise( float_flag_divbyzero STATUS_VAR);
1882            return packFloat32( zSign, 0xFF, 0 );
1883        }
1884        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885    }
1886    if ( aExp == 0 ) {
1887        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889    }
1890    zExp = aExp - bExp + 0x7D;
1891    aSig = ( aSig | 0x00800000 )<<7;
1892    bSig = ( bSig | 0x00800000 )<<8;
1893    if ( bSig <= ( aSig + aSig ) ) {
1894        aSig >>= 1;
1895        ++zExp;
1896    }
1897    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898    if ( ( zSig & 0x3F ) == 0 ) {
1899        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900    }
1901    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903}
1904
1905/*----------------------------------------------------------------------------
1906| Returns the remainder of the single-precision floating-point value `a'
1907| with respect to the corresponding value `b'.  The operation is performed
1908| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909*----------------------------------------------------------------------------*/
1910
1911float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912{
1913    flag aSign, bSign, zSign;
1914    int16 aExp, bExp, expDiff;
1915    bits32 aSig, bSig;
1916    bits32 q;
1917    bits64 aSig64, bSig64, q64;
1918    bits32 alternateASig;
1919    sbits32 sigMean;
1920
1921    aSig = extractFloat32Frac( a );
1922    aExp = extractFloat32Exp( a );
1923    aSign = extractFloat32Sign( a );
1924    bSig = extractFloat32Frac( b );
1925    bExp = extractFloat32Exp( b );
1926    bSign = extractFloat32Sign( b );
1927    if ( aExp == 0xFF ) {
1928        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1929            return propagateFloat32NaN( a, b STATUS_VAR );
1930        }
1931        float_raise( float_flag_invalid STATUS_VAR);
1932        return float32_default_nan;
1933    }
1934    if ( bExp == 0xFF ) {
1935        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936        return a;
1937    }
1938    if ( bExp == 0 ) {
1939        if ( bSig == 0 ) {
1940            float_raise( float_flag_invalid STATUS_VAR);
1941            return float32_default_nan;
1942        }
1943        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944    }
1945    if ( aExp == 0 ) {
1946        if ( aSig == 0 ) return a;
1947        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948    }
1949    expDiff = aExp - bExp;
1950    aSig |= 0x00800000;
1951    bSig |= 0x00800000;
1952    if ( expDiff < 32 ) {
1953        aSig <<= 8;
1954        bSig <<= 8;
1955        if ( expDiff < 0 ) {
1956            if ( expDiff < -1 ) return a;
1957            aSig >>= 1;
1958        }
1959        q = ( bSig <= aSig );
1960        if ( q ) aSig -= bSig;
1961        if ( 0 < expDiff ) {
1962            q = ( ( (bits64) aSig )<<32 ) / bSig;
1963            q >>= 32 - expDiff;
1964            bSig >>= 2;
1965            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966        }
1967        else {
1968            aSig >>= 2;
1969            bSig >>= 2;
1970        }
1971    }
1972    else {
1973        if ( bSig <= aSig ) aSig -= bSig;
1974        aSig64 = ( (bits64) aSig )<<40;
1975        bSig64 = ( (bits64) bSig )<<40;
1976        expDiff -= 64;
1977        while ( 0 < expDiff ) {
1978            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980            aSig64 = - ( ( bSig * q64 )<<38 );
1981            expDiff -= 62;
1982        }
1983        expDiff += 64;
1984        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986        q = q64>>( 64 - expDiff );
1987        bSig <<= 6;
1988        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989    }
1990    do {
1991        alternateASig = aSig;
1992        ++q;
1993        aSig -= bSig;
1994    } while ( 0 <= (sbits32) aSig );
1995    sigMean = aSig + alternateASig;
1996    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997        aSig = alternateASig;
1998    }
1999    zSign = ( (sbits32) aSig < 0 );
2000    if ( zSign ) aSig = - aSig;
2001    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2002
2003}
2004
2005/*----------------------------------------------------------------------------
2006| Returns the square root of the single-precision floating-point value `a'.
2007| The operation is performed according to the IEC/IEEE Standard for Binary
2008| Floating-Point Arithmetic.
2009*----------------------------------------------------------------------------*/
2010
2011float32 float32_sqrt( float32 a STATUS_PARAM )
2012{
2013    flag aSign;
2014    int16 aExp, zExp;
2015    bits32 aSig, zSig;
2016    bits64 rem, term;
2017
2018    aSig = extractFloat32Frac( a );
2019    aExp = extractFloat32Exp( a );
2020    aSign = extractFloat32Sign( a );
2021    if ( aExp == 0xFF ) {
2022        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023        if ( ! aSign ) return a;
2024        float_raise( float_flag_invalid STATUS_VAR);
2025        return float32_default_nan;
2026    }
2027    if ( aSign ) {
2028        if ( ( aExp | aSig ) == 0 ) return a;
2029        float_raise( float_flag_invalid STATUS_VAR);
2030        return float32_default_nan;
2031    }
2032    if ( aExp == 0 ) {
2033        if ( aSig == 0 ) return float32_zero;
2034        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035    }
2036    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037    aSig = ( aSig | 0x00800000 )<<8;
2038    zSig = estimateSqrt32( aExp, aSig ) + 2;
2039    if ( ( zSig & 0x7F ) <= 5 ) {
2040        if ( zSig < 2 ) {
2041            zSig = 0x7FFFFFFF;
2042            goto roundAndPack;
2043        }
2044        aSig >>= aExp & 1;
2045        term = ( (bits64) zSig ) * zSig;
2046        rem = ( ( (bits64) aSig )<<32 ) - term;
2047        while ( (sbits64) rem < 0 ) {
2048            --zSig;
2049            rem += ( ( (bits64) zSig )<<1 ) | 1;
2050        }
2051        zSig |= ( rem != 0 );
2052    }
2053    shift32RightJamming( zSig, 1, &zSig );
2054 roundAndPack:
2055    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2056
2057}
2058
2059/*----------------------------------------------------------------------------
2060| Returns the binary log of the single-precision floating-point value `a'.
2061| The operation is performed according to the IEC/IEEE Standard for Binary
2062| Floating-Point Arithmetic.
2063*----------------------------------------------------------------------------*/
2064float32 float32_log2( float32 a STATUS_PARAM )
2065{
2066    flag aSign, zSign;
2067    int16 aExp;
2068    bits32 aSig, zSig, i;
2069
2070    aSig = extractFloat32Frac( a );
2071    aExp = extractFloat32Exp( a );
2072    aSign = extractFloat32Sign( a );
2073
2074    if ( aExp == 0 ) {
2075        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2076        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2077    }
2078    if ( aSign ) {
2079        float_raise( float_flag_invalid STATUS_VAR);
2080        return float32_default_nan;
2081    }
2082    if ( aExp == 0xFF ) {
2083        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2084        return a;
2085    }
2086
2087    aExp -= 0x7F;
2088    aSig |= 0x00800000;
2089    zSign = aExp < 0;
2090    zSig = aExp << 23;
2091
2092    for (i = 1 << 22; i > 0; i >>= 1) {
2093        aSig = ( (bits64)aSig * aSig ) >> 23;
2094        if ( aSig & 0x01000000 ) {
2095            aSig >>= 1;
2096            zSig |= i;
2097        }
2098    }
2099
2100    if ( zSign )
2101        zSig = -zSig;
2102
2103    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2104}
2105
2106/*----------------------------------------------------------------------------
2107| Returns 1 if the single-precision floating-point value `a' is equal to
2108| the corresponding value `b', and 0 otherwise.  The comparison is performed
2109| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2110*----------------------------------------------------------------------------*/
2111
2112int float32_eq( float32 a, float32 b STATUS_PARAM )
2113{
2114
2115    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117       ) {
2118        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119            float_raise( float_flag_invalid STATUS_VAR);
2120        }
2121        return 0;
2122    }
2123    return ( float32_val(a) == float32_val(b) ) ||
2124            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2125
2126}
2127
2128/*----------------------------------------------------------------------------
2129| Returns 1 if the single-precision floating-point value `a' is less than
2130| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2131| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2132| Arithmetic.
2133*----------------------------------------------------------------------------*/
2134
2135int float32_le( float32 a, float32 b STATUS_PARAM )
2136{
2137    flag aSign, bSign;
2138    bits32 av, bv;
2139
2140    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142       ) {
2143        float_raise( float_flag_invalid STATUS_VAR);
2144        return 0;
2145    }
2146    aSign = extractFloat32Sign( a );
2147    bSign = extractFloat32Sign( b );
2148    av = float32_val(a);
2149    bv = float32_val(b);
2150    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2151    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2152
2153}
2154
2155/*----------------------------------------------------------------------------
2156| Returns 1 if the single-precision floating-point value `a' is less than
2157| the corresponding value `b', and 0 otherwise.  The comparison is performed
2158| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2159*----------------------------------------------------------------------------*/
2160
2161int float32_lt( float32 a, float32 b STATUS_PARAM )
2162{
2163    flag aSign, bSign;
2164    bits32 av, bv;
2165
2166    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2167         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2168       ) {
2169        float_raise( float_flag_invalid STATUS_VAR);
2170        return 0;
2171    }
2172    aSign = extractFloat32Sign( a );
2173    bSign = extractFloat32Sign( b );
2174    av = float32_val(a);
2175    bv = float32_val(b);
2176    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2177    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2178
2179}
2180
2181/*----------------------------------------------------------------------------
2182| Returns 1 if the single-precision floating-point value `a' is equal to
2183| the corresponding value `b', and 0 otherwise.  The invalid exception is
2184| raised if either operand is a NaN.  Otherwise, the comparison is performed
2185| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2186*----------------------------------------------------------------------------*/
2187
2188int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2189{
2190    bits32 av, bv;
2191
2192    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2193         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194       ) {
2195        float_raise( float_flag_invalid STATUS_VAR);
2196        return 0;
2197    }
2198    av = float32_val(a);
2199    bv = float32_val(b);
2200    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2201
2202}
2203
2204/*----------------------------------------------------------------------------
2205| Returns 1 if the single-precision floating-point value `a' is less than or
2206| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2207| cause an exception.  Otherwise, the comparison is performed according to the
2208| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2209*----------------------------------------------------------------------------*/
2210
2211int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2212{
2213    flag aSign, bSign;
2214    bits32 av, bv;
2215
2216    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218       ) {
2219        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2220            float_raise( float_flag_invalid STATUS_VAR);
2221        }
2222        return 0;
2223    }
2224    aSign = extractFloat32Sign( a );
2225    bSign = extractFloat32Sign( b );
2226    av = float32_val(a);
2227    bv = float32_val(b);
2228    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2230
2231}
2232
2233/*----------------------------------------------------------------------------
2234| Returns 1 if the single-precision floating-point value `a' is less than
2235| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2236| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2237| Standard for Binary Floating-Point Arithmetic.
2238*----------------------------------------------------------------------------*/
2239
2240int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2241{
2242    flag aSign, bSign;
2243    bits32 av, bv;
2244
2245    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2246         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247       ) {
2248        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2249            float_raise( float_flag_invalid STATUS_VAR);
2250        }
2251        return 0;
2252    }
2253    aSign = extractFloat32Sign( a );
2254    bSign = extractFloat32Sign( b );
2255    av = float32_val(a);
2256    bv = float32_val(b);
2257    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2258    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2259
2260}
2261
2262/*----------------------------------------------------------------------------
2263| Returns the result of converting the double-precision floating-point value
2264| `a' to the 32-bit two's complement integer format.  The conversion is
2265| performed according to the IEC/IEEE Standard for Binary Floating-Point
2266| Arithmetic---which means in particular that the conversion is rounded
2267| according to the current rounding mode.  If `a' is a NaN, the largest
2268| positive integer is returned.  Otherwise, if the conversion overflows, the
2269| largest integer with the same sign as `a' is returned.
2270*----------------------------------------------------------------------------*/
2271
2272int32 float64_to_int32( float64 a STATUS_PARAM )
2273{
2274    flag aSign;
2275    int16 aExp, shiftCount;
2276    bits64 aSig;
2277
2278    aSig = extractFloat64Frac( a );
2279    aExp = extractFloat64Exp( a );
2280    aSign = extractFloat64Sign( a );
2281    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2282    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2283    shiftCount = 0x42C - aExp;
2284    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2285    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2286
2287}
2288
2289/*----------------------------------------------------------------------------
2290| Returns the result of converting the double-precision floating-point value
2291| `a' to the 32-bit two's complement integer format.  The conversion is
2292| performed according to the IEC/IEEE Standard for Binary Floating-Point
2293| Arithmetic, except that the conversion is always rounded toward zero.
2294| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2295| the conversion overflows, the largest integer with the same sign as `a' is
2296| returned.
2297*----------------------------------------------------------------------------*/
2298
2299int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2300{
2301    flag aSign;
2302    int16 aExp, shiftCount;
2303    bits64 aSig, savedASig;
2304    int32 z;
2305
2306    aSig = extractFloat64Frac( a );
2307    aExp = extractFloat64Exp( a );
2308    aSign = extractFloat64Sign( a );
2309    if ( 0x41E < aExp ) {
2310        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2311        goto invalid;
2312    }
2313    else if ( aExp < 0x3FF ) {
2314        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315        return 0;
2316    }
2317    aSig |= LIT64( 0x0010000000000000 );
2318    shiftCount = 0x433 - aExp;
2319    savedASig = aSig;
2320    aSig >>= shiftCount;
2321    z = aSig;
2322    if ( aSign ) z = - z;
2323    if ( ( z < 0 ) ^ aSign ) {
2324 invalid:
2325        float_raise( float_flag_invalid STATUS_VAR);
2326        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2327    }
2328    if ( ( aSig<<shiftCount ) != savedASig ) {
2329        STATUS(float_exception_flags) |= float_flag_inexact;
2330    }
2331    return z;
2332
2333}
2334
2335/*----------------------------------------------------------------------------
2336| Returns the result of converting the double-precision floating-point value
2337| `a' to the 64-bit two's complement integer format.  The conversion is
2338| performed according to the IEC/IEEE Standard for Binary Floating-Point
2339| Arithmetic---which means in particular that the conversion is rounded
2340| according to the current rounding mode.  If `a' is a NaN, the largest
2341| positive integer is returned.  Otherwise, if the conversion overflows, the
2342| largest integer with the same sign as `a' is returned.
2343*----------------------------------------------------------------------------*/
2344
2345int64 float64_to_int64( float64 a STATUS_PARAM )
2346{
2347    flag aSign;
2348    int16 aExp, shiftCount;
2349    bits64 aSig, aSigExtra;
2350
2351    aSig = extractFloat64Frac( a );
2352    aExp = extractFloat64Exp( a );
2353    aSign = extractFloat64Sign( a );
2354    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2355    shiftCount = 0x433 - aExp;
2356    if ( shiftCount <= 0 ) {
2357        if ( 0x43E < aExp ) {
2358            float_raise( float_flag_invalid STATUS_VAR);
2359            if (    ! aSign
2360                 || (    ( aExp == 0x7FF )
2361                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2362               ) {
2363                return LIT64( 0x7FFFFFFFFFFFFFFF );
2364            }
2365            return (sbits64) LIT64( 0x8000000000000000 );
2366        }
2367        aSigExtra = 0;
2368        aSig <<= - shiftCount;
2369    }
2370    else {
2371        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2372    }
2373    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2374
2375}
2376
2377/*----------------------------------------------------------------------------
2378| Returns the result of converting the double-precision floating-point value
2379| `a' to the 64-bit two's complement integer format.  The conversion is
2380| performed according to the IEC/IEEE Standard for Binary Floating-Point
2381| Arithmetic, except that the conversion is always rounded toward zero.
2382| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2383| the conversion overflows, the largest integer with the same sign as `a' is
2384| returned.
2385*----------------------------------------------------------------------------*/
2386
2387int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2388{
2389    flag aSign;
2390    int16 aExp, shiftCount;
2391    bits64 aSig;
2392    int64 z;
2393
2394    aSig = extractFloat64Frac( a );
2395    aExp = extractFloat64Exp( a );
2396    aSign = extractFloat64Sign( a );
2397    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2398    shiftCount = aExp - 0x433;
2399    if ( 0 <= shiftCount ) {
2400        if ( 0x43E <= aExp ) {
2401            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2402                float_raise( float_flag_invalid STATUS_VAR);
2403                if (    ! aSign
2404                     || (    ( aExp == 0x7FF )
2405                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2406                   ) {
2407                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2408                }
2409            }
2410            return (sbits64) LIT64( 0x8000000000000000 );
2411        }
2412        z = aSig<<shiftCount;
2413    }
2414    else {
2415        if ( aExp < 0x3FE ) {
2416            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2417            return 0;
2418        }
2419        z = aSig>>( - shiftCount );
2420        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2421            STATUS(float_exception_flags) |= float_flag_inexact;
2422        }
2423    }
2424    if ( aSign ) z = - z;
2425    return z;
2426
2427}
2428
2429/*----------------------------------------------------------------------------
2430| Returns the result of converting the double-precision floating-point value
2431| `a' to the single-precision floating-point format.  The conversion is
2432| performed according to the IEC/IEEE Standard for Binary Floating-Point
2433| Arithmetic.
2434*----------------------------------------------------------------------------*/
2435
2436float32 float64_to_float32( float64 a STATUS_PARAM )
2437{
2438    flag aSign;
2439    int16 aExp;
2440    bits64 aSig;
2441    bits32 zSig;
2442
2443    aSig = extractFloat64Frac( a );
2444    aExp = extractFloat64Exp( a );
2445    aSign = extractFloat64Sign( a );
2446    if ( aExp == 0x7FF ) {
2447        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2448        return packFloat32( aSign, 0xFF, 0 );
2449    }
2450    shift64RightJamming( aSig, 22, &aSig );
2451    zSig = aSig;
2452    if ( aExp || zSig ) {
2453        zSig |= 0x40000000;
2454        aExp -= 0x381;
2455    }
2456    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2457
2458}
2459
2460
2461/*----------------------------------------------------------------------------
2462| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2463| half-precision floating-point value, returning the result.  After being
2464| shifted into the proper positions, the three fields are simply added
2465| together to form the result.  This means that any integer portion of `zSig'
2466| will be added into the exponent.  Since a properly normalized significand
2467| will have an integer portion equal to 1, the `zExp' input should be 1 less
2468| than the desired result exponent whenever `zSig' is a complete, normalized
2469| significand.
2470*----------------------------------------------------------------------------*/
2471static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2472{
2473    return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2474}
2475
2476/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2477   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2478  
2479float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2480{
2481    flag aSign;
2482    int16 aExp;
2483    bits32 aSig;
2484
2485    aSign = a >> 15;
2486    aExp = (a >> 10) & 0x1f;
2487    aSig = a & 0x3ff;
2488
2489    if (aExp == 0x1f && ieee) {
2490        if (aSig) {
2491            /* Make sure correct exceptions are raised.  */
2492            float32ToCommonNaN(a STATUS_VAR);
2493            aSig |= 0x200;
2494        }
2495        return packFloat32(aSign, 0xff, aSig << 13);
2496    }
2497    if (aExp == 0) {
2498        int8 shiftCount;
2499
2500        if (aSig == 0) {
2501            return packFloat32(aSign, 0, 0);
2502        }
2503
2504        shiftCount = countLeadingZeros32( aSig ) - 21;
2505        aSig = aSig << shiftCount;
2506        aExp = -shiftCount;
2507    }
2508    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2509}
2510
2511bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2512{
2513    flag aSign;
2514    int16 aExp;
2515    bits32 aSig;
2516    bits32 mask;
2517    bits32 increment;
2518    int8 roundingMode;
2519
2520    aSig = extractFloat32Frac( a );
2521    aExp = extractFloat32Exp( a );
2522    aSign = extractFloat32Sign( a );
2523    if ( aExp == 0xFF ) {
2524        if (aSig) {
2525            /* Make sure correct exceptions are raised.  */
2526            float32ToCommonNaN(a STATUS_VAR);
2527            aSig |= 0x00400000;
2528        }
2529        return packFloat16(aSign, 0x1f, aSig >> 13);
2530    }
2531    if (aExp == 0 && aSign == 0) {
2532        return packFloat16(aSign, 0, 0);
2533    }
2534    /* Decimal point between bits 22 and 23.  */
2535    aSig |= 0x00800000;
2536    aExp -= 0x7f;
2537    if (aExp < -14) {
2538        mask = 0x007fffff;
2539        if (aExp < -24) {
2540            aExp = -25;
2541        } else {
2542            mask >>= 24 + aExp;
2543        }
2544    } else {
2545        mask = 0x00001fff;
2546    }
2547    if (aSig & mask) {
2548        float_raise( float_flag_underflow STATUS_VAR );
2549        roundingMode = STATUS(float_rounding_mode);
2550        switch (roundingMode) {
2551        case float_round_nearest_even:
2552            increment = (mask + 1) >> 1;
2553            if ((aSig & mask) == increment) {
2554                increment = aSig & (increment << 1);
2555            }
2556            break;
2557        case float_round_up:
2558            increment = aSign ? 0 : mask;
2559            break;
2560        case float_round_down:
2561            increment = aSign ? mask : 0;
2562            break;
2563        default: /* round_to_zero */
2564            increment = 0;
2565            break;
2566        }
2567        aSig += increment;
2568        if (aSig >= 0x01000000) {
2569            aSig >>= 1;
2570            aExp++;
2571        }
2572    } else if (aExp < -14
2573          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2574        float_raise( float_flag_underflow STATUS_VAR);
2575    }
2576
2577    if (ieee) {
2578        if (aExp > 15) {
2579            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2580            return packFloat16(aSign, 0x1f, 0);
2581        }
2582    } else {
2583        if (aExp > 16) {
2584            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2585            return packFloat16(aSign, 0x1f, 0x3ff);
2586        }
2587    }
2588    if (aExp < -24) {
2589        return packFloat16(aSign, 0, 0);
2590    }
2591    if (aExp < -14) {
2592        aSig >>= -14 - aExp;
2593        aExp = -14;
2594    }
2595    return packFloat16(aSign, aExp + 14, aSig >> 13);
2596}
2597
2598#ifdef FLOATX80
2599
2600/*----------------------------------------------------------------------------
2601| Returns the result of converting the double-precision floating-point value
2602| `a' to the extended double-precision floating-point format.  The conversion
2603| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2604| Arithmetic.
2605*----------------------------------------------------------------------------*/
2606
2607floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2608{
2609    flag aSign;
2610    int16 aExp;
2611    bits64 aSig;
2612
2613    aSig = extractFloat64Frac( a );
2614    aExp = extractFloat64Exp( a );
2615    aSign = extractFloat64Sign( a );
2616    if ( aExp == 0x7FF ) {
2617        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2618        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2619    }
2620    if ( aExp == 0 ) {
2621        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2622        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2623    }
2624    return
2625        packFloatx80(
2626            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2627
2628}
2629
2630#endif
2631
2632#ifdef FLOAT128
2633
2634/*----------------------------------------------------------------------------
2635| Returns the result of converting the double-precision floating-point value
2636| `a' to the quadruple-precision floating-point format.  The conversion is
2637| performed according to the IEC/IEEE Standard for Binary Floating-Point
2638| Arithmetic.
2639*----------------------------------------------------------------------------*/
2640
2641float128 float64_to_float128( float64 a STATUS_PARAM )
2642{
2643    flag aSign;
2644    int16 aExp;
2645    bits64 aSig, zSig0, zSig1;
2646
2647    aSig = extractFloat64Frac( a );
2648    aExp = extractFloat64Exp( a );
2649    aSign = extractFloat64Sign( a );
2650    if ( aExp == 0x7FF ) {
2651        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2652        return packFloat128( aSign, 0x7FFF, 0, 0 );
2653    }
2654    if ( aExp == 0 ) {
2655        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2656        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2657        --aExp;
2658    }
2659    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2660    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2661
2662}
2663
2664#endif
2665
2666/*----------------------------------------------------------------------------
2667| Rounds the double-precision floating-point value `a' to an integer, and
2668| returns the result as a double-precision floating-point value.  The
2669| operation is performed according to the IEC/IEEE Standard for Binary
2670| Floating-Point Arithmetic.
2671*----------------------------------------------------------------------------*/
2672
2673float64 float64_round_to_int( float64 a STATUS_PARAM )
2674{
2675    flag aSign;
2676    int16 aExp;
2677    bits64 lastBitMask, roundBitsMask;
2678    int8 roundingMode;
2679    bits64 z;
2680
2681    aExp = extractFloat64Exp( a );
2682    if ( 0x433 <= aExp ) {
2683        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2684            return propagateFloat64NaN( a, a STATUS_VAR );
2685        }
2686        return a;
2687    }
2688    if ( aExp < 0x3FF ) {
2689        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2690        STATUS(float_exception_flags) |= float_flag_inexact;
2691        aSign = extractFloat64Sign( a );
2692        switch ( STATUS(float_rounding_mode) ) {
2693         case float_round_nearest_even:
2694            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2695                return packFloat64( aSign, 0x3FF, 0 );
2696            }
2697            break;
2698         case float_round_down:
2699            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2700         case float_round_up:
2701            return make_float64(
2702            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2703        }
2704        return packFloat64( aSign, 0, 0 );
2705    }
2706    lastBitMask = 1;
2707    lastBitMask <<= 0x433 - aExp;
2708    roundBitsMask = lastBitMask - 1;
2709    z = float64_val(a);
2710    roundingMode = STATUS(float_rounding_mode);
2711    if ( roundingMode == float_round_nearest_even ) {
2712        z += lastBitMask>>1;
2713        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2714    }
2715    else if ( roundingMode != float_round_to_zero ) {
2716        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2717            z += roundBitsMask;
2718        }
2719    }
2720    z &= ~ roundBitsMask;
2721    if ( z != float64_val(a) )
2722        STATUS(float_exception_flags) |= float_flag_inexact;
2723    return make_float64(z);
2724
2725}
2726
2727float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2728{
2729    int oldmode;
2730    float64 res;
2731    oldmode = STATUS(float_rounding_mode);
2732    STATUS(float_rounding_mode) = float_round_to_zero;
2733    res = float64_round_to_int(a STATUS_VAR);
2734    STATUS(float_rounding_mode) = oldmode;
2735    return res;
2736}
2737
2738/*----------------------------------------------------------------------------
2739| Returns the result of adding the absolute values of the double-precision
2740| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2741| before being returned.  `zSign' is ignored if the result is a NaN.
2742| The addition is performed according to the IEC/IEEE Standard for Binary
2743| Floating-Point Arithmetic.
2744*----------------------------------------------------------------------------*/
2745
2746static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2747{
2748    int16 aExp, bExp, zExp;
2749    bits64 aSig, bSig, zSig;
2750    int16 expDiff;
2751
2752    aSig = extractFloat64Frac( a );
2753    aExp = extractFloat64Exp( a );
2754    bSig = extractFloat64Frac( b );
2755    bExp = extractFloat64Exp( b );
2756    expDiff = aExp - bExp;
2757    aSig <<= 9;
2758    bSig <<= 9;
2759    if ( 0 < expDiff ) {
2760        if ( aExp == 0x7FF ) {
2761            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2762            return a;
2763        }
2764        if ( bExp == 0 ) {
2765            --expDiff;
2766        }
2767        else {
2768            bSig |= LIT64( 0x2000000000000000 );
2769        }
2770        shift64RightJamming( bSig, expDiff, &bSig );
2771        zExp = aExp;
2772    }
2773    else if ( expDiff < 0 ) {
2774        if ( bExp == 0x7FF ) {
2775            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2776            return packFloat64( zSign, 0x7FF, 0 );
2777        }
2778        if ( aExp == 0 ) {
2779            ++expDiff;
2780        }
2781        else {
2782            aSig |= LIT64( 0x2000000000000000 );
2783        }
2784        shift64RightJamming( aSig, - expDiff, &aSig );
2785        zExp = bExp;
2786    }
2787    else {
2788        if ( aExp == 0x7FF ) {
2789            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2790            return a;
2791        }
2792        if ( aExp == 0 ) {
2793            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2794            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2795        }
2796        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2797        zExp = aExp;
2798        goto roundAndPack;
2799    }
2800    aSig |= LIT64( 0x2000000000000000 );
2801    zSig = ( aSig + bSig )<<1;
2802    --zExp;
2803    if ( (sbits64) zSig < 0 ) {
2804        zSig = aSig + bSig;
2805        ++zExp;
2806    }
2807 roundAndPack:
2808    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2809
2810}
2811
2812/*----------------------------------------------------------------------------
2813| Returns the result of subtracting the absolute values of the double-
2814| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2815| difference is negated before being returned.  `zSign' is ignored if the
2816| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2817| Standard for Binary Floating-Point Arithmetic.
2818*----------------------------------------------------------------------------*/
2819
2820static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2821{
2822    int16 aExp, bExp, zExp;
2823    bits64 aSig, bSig, zSig;
2824    int16 expDiff;
2825
2826    aSig = extractFloat64Frac( a );
2827    aExp = extractFloat64Exp( a );
2828    bSig = extractFloat64Frac( b );
2829    bExp = extractFloat64Exp( b );
2830    expDiff = aExp - bExp;
2831    aSig <<= 10;
2832    bSig <<= 10;
2833    if ( 0 < expDiff ) goto aExpBigger;
2834    if ( expDiff < 0 ) goto bExpBigger;
2835    if ( aExp == 0x7FF ) {
2836        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2837        float_raise( float_flag_invalid STATUS_VAR);
2838        return float64_default_nan;
2839    }
2840    if ( aExp == 0 ) {
2841        aExp = 1;
2842        bExp = 1;
2843    }
2844    if ( bSig < aSig ) goto aBigger;
2845    if ( aSig < bSig ) goto bBigger;
2846    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2847 bExpBigger:
2848    if ( bExp == 0x7FF ) {
2849        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2850        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2851    }
2852    if ( aExp == 0 ) {
2853        ++expDiff;
2854    }
2855    else {
2856        aSig |= LIT64( 0x4000000000000000 );
2857    }
2858    shift64RightJamming( aSig, - expDiff, &aSig );
2859    bSig |= LIT64( 0x4000000000000000 );
2860 bBigger:
2861    zSig = bSig - aSig;
2862    zExp = bExp;
2863    zSign ^= 1;
2864    goto normalizeRoundAndPack;
2865 aExpBigger:
2866    if ( aExp == 0x7FF ) {
2867        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2868        return a;
2869    }
2870    if ( bExp == 0 ) {
2871        --expDiff;
2872    }
2873    else {
2874        bSig |= LIT64( 0x4000000000000000 );
2875    }
2876    shift64RightJamming( bSig, expDiff, &bSig );
2877    aSig |= LIT64( 0x4000000000000000 );
2878 aBigger:
2879    zSig = aSig - bSig;
2880    zExp = aExp;
2881 normalizeRoundAndPack:
2882    --zExp;
2883    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2884
2885}
2886
2887/*----------------------------------------------------------------------------
2888| Returns the result of adding the double-precision floating-point values `a'
2889| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2890| Binary Floating-Point Arithmetic.
2891*----------------------------------------------------------------------------*/
2892
2893float64 float64_add( float64 a, float64 b STATUS_PARAM )
2894{
2895    flag aSign, bSign;
2896
2897    aSign = extractFloat64Sign( a );
2898    bSign = extractFloat64Sign( b );
2899    if ( aSign == bSign ) {
2900        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2901    }
2902    else {
2903        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2904    }
2905
2906}
2907
2908/*----------------------------------------------------------------------------
2909| Returns the result of subtracting the double-precision floating-point values
2910| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2911| for Binary Floating-Point Arithmetic.
2912*----------------------------------------------------------------------------*/
2913
2914float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2915{
2916    flag aSign, bSign;
2917
2918    aSign = extractFloat64Sign( a );
2919    bSign = extractFloat64Sign( b );
2920    if ( aSign == bSign ) {
2921        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2922    }
2923    else {
2924        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2925    }
2926
2927}
2928
2929/*----------------------------------------------------------------------------
2930| Returns the result of multiplying the double-precision floating-point values
2931| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2932| for Binary Floating-Point Arithmetic.
2933*----------------------------------------------------------------------------*/
2934
2935float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2936{
2937    flag aSign, bSign, zSign;
2938    int16 aExp, bExp, zExp;
2939    bits64 aSig, bSig, zSig0, zSig1;
2940
2941    aSig = extractFloat64Frac( a );
2942    aExp = extractFloat64Exp( a );
2943    aSign = extractFloat64Sign( a );
2944    bSig = extractFloat64Frac( b );
2945    bExp = extractFloat64Exp( b );
2946    bSign = extractFloat64Sign( b );
2947    zSign = aSign ^ bSign;
2948    if ( aExp == 0x7FF ) {
2949        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2950            return propagateFloat64NaN( a, b STATUS_VAR );
2951        }
2952        if ( ( bExp | bSig ) == 0 ) {
2953            float_raise( float_flag_invalid STATUS_VAR);
2954            return float64_default_nan;
2955        }
2956        return packFloat64( zSign, 0x7FF, 0 );
2957    }
2958    if ( bExp == 0x7FF ) {
2959        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2960        if ( ( aExp | aSig ) == 0 ) {
2961            float_raise( float_flag_invalid STATUS_VAR);
2962            return float64_default_nan;
2963        }
2964        return packFloat64( zSign, 0x7FF, 0 );
2965    }
2966    if ( aExp == 0 ) {
2967        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2968        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2969    }
2970    if ( bExp == 0 ) {
2971        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2972        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2973    }
2974    zExp = aExp + bExp - 0x3FF;
2975    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2976    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2977    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2978    zSig0 |= ( zSig1 != 0 );
2979    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2980        zSig0 <<= 1;
2981        --zExp;
2982    }
2983    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2984
2985}
2986
2987/*----------------------------------------------------------------------------
2988| Returns the result of dividing the double-precision floating-point value `a'
2989| by the corresponding value `b'.  The operation is performed according to
2990| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991*----------------------------------------------------------------------------*/
2992
2993float64 float64_div( float64 a, float64 b STATUS_PARAM )
2994{
2995    flag aSign, bSign, zSign;
2996    int16 aExp, bExp, zExp;
2997    bits64 aSig, bSig, zSig;
2998    bits64 rem0, rem1;
2999    bits64 term0, term1;
3000
3001    aSig = extractFloat64Frac( a );
3002    aExp = extractFloat64Exp( a );
3003    aSign = extractFloat64Sign( a );
3004    bSig = extractFloat64Frac( b );
3005    bExp = extractFloat64Exp( b );
3006    bSign = extractFloat64Sign( b );
3007    zSign = aSign ^ bSign;
3008    if ( aExp == 0x7FF ) {
3009        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3010        if ( bExp == 0x7FF ) {
3011            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3012            float_raise( float_flag_invalid STATUS_VAR);
3013            return float64_default_nan;
3014        }
3015        return packFloat64( zSign, 0x7FF, 0 );
3016    }
3017    if ( bExp == 0x7FF ) {
3018        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3019        return packFloat64( zSign, 0, 0 );
3020    }
3021    if ( bExp == 0 ) {
3022        if ( bSig == 0 ) {
3023            if ( ( aExp | aSig ) == 0 ) {
3024                float_raise( float_flag_invalid STATUS_VAR);
3025                return float64_default_nan;
3026            }
3027            float_raise( float_flag_divbyzero STATUS_VAR);
3028            return packFloat64( zSign, 0x7FF, 0 );
3029        }
3030        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3031    }
3032    if ( aExp == 0 ) {
3033        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3034        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3035    }
3036    zExp = aExp - bExp + 0x3FD;
3037    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3038    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3039    if ( bSig <= ( aSig + aSig ) ) {
3040        aSig >>= 1;
3041        ++zExp;
3042    }
3043    zSig = estimateDiv128To64( aSig, 0, bSig );
3044    if ( ( zSig & 0x1FF ) <= 2 ) {
3045        mul64To128( bSig, zSig, &term0, &term1 );
3046        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3047        while ( (sbits64) rem0 < 0 ) {
3048            --zSig;
3049            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3050        }
3051        zSig |= ( rem1 != 0 );
3052    }
3053    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3054
3055}
3056
3057/*----------------------------------------------------------------------------
3058| Returns the remainder of the double-precision floating-point value `a'
3059| with respect to the corresponding value `b'.  The operation is performed
3060| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3061*----------------------------------------------------------------------------*/
3062
3063float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3064{
3065    flag aSign, bSign, zSign;
3066    int16 aExp, bExp, expDiff;
3067    bits64 aSig, bSig;
3068    bits64 q, alternateASig;
3069    sbits64 sigMean;
3070
3071    aSig = extractFloat64Frac( a );
3072    aExp = extractFloat64Exp( a );
3073    aSign = extractFloat64Sign( a );
3074    bSig = extractFloat64Frac( b );
3075    bExp = extractFloat64Exp( b );
3076    bSign = extractFloat64Sign( b );
3077    if ( aExp == 0x7FF ) {
3078        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3079            return propagateFloat64NaN( a, b STATUS_VAR );
3080        }
3081        float_raise( float_flag_invalid STATUS_VAR);
3082        return float64_default_nan;
3083    }
3084    if ( bExp == 0x7FF ) {
3085        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086        return a;
3087    }
3088    if ( bExp == 0 ) {
3089        if ( bSig == 0 ) {
3090            float_raise( float_flag_invalid STATUS_VAR);
3091            return float64_default_nan;
3092        }
3093        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3094    }
3095    if ( aExp == 0 ) {
3096        if ( aSig == 0 ) return a;
3097        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3098    }
3099    expDiff = aExp - bExp;
3100    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3101    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3102    if ( expDiff < 0 ) {
3103        if ( expDiff < -1 ) return a;
3104        aSig >>= 1;
3105    }
3106    q = ( bSig <= aSig );
3107    if ( q ) aSig -= bSig;
3108    expDiff -= 64;
3109    while ( 0 < expDiff ) {
3110        q = estimateDiv128To64( aSig, 0, bSig );
3111        q = ( 2 < q ) ? q - 2 : 0;
3112        aSig = - ( ( bSig>>2 ) * q );
3113        expDiff -= 62;
3114    }
3115    expDiff += 64;
3116    if ( 0 < expDiff ) {
3117        q = estimateDiv128To64( aSig, 0, bSig );
3118        q = ( 2 < q ) ? q - 2 : 0;
3119        q >>= 64 - expDiff;
3120        bSig >>= 2;
3121        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3122    }
3123    else {
3124        aSig >>= 2;
3125        bSig >>= 2;
3126    }
3127    do {
3128        alternateASig = aSig;
3129        ++q;
3130        aSig -= bSig;
3131    } while ( 0 <= (sbits64) aSig );
3132    sigMean = aSig + alternateASig;
3133    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3134        aSig = alternateASig;
3135    }
3136    zSign = ( (sbits64) aSig < 0 );
3137    if ( zSign ) aSig = - aSig;
3138    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3139
3140}
3141
3142/*----------------------------------------------------------------------------
3143| Returns the square root of the double-precision floating-point value `a'.
3144| The operation is performed according to the IEC/IEEE Standard for Binary
3145| Floating-Point Arithmetic.
3146*----------------------------------------------------------------------------*/
3147
3148float64 float64_sqrt( float64 a STATUS_PARAM )
3149{
3150    flag aSign;
3151    int16 aExp, zExp;
3152    bits64 aSig, zSig, doubleZSig;
3153    bits64 rem0, rem1, term0, term1;
3154
3155    aSig = extractFloat64Frac( a );
3156    aExp = extractFloat64Exp( a );
3157    aSign = extractFloat64Sign( a );
3158    if ( aExp == 0x7FF ) {
3159        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3160        if ( ! aSign ) return a;
3161        float_raise( float_flag_invalid STATUS_VAR);
3162        return float64_default_nan;
3163    }
3164    if ( aSign ) {
3165        if ( ( aExp | aSig ) == 0 ) return a;
3166        float_raise( float_flag_invalid STATUS_VAR);
3167        return float64_default_nan;
3168    }
3169    if ( aExp == 0 ) {
3170        if ( aSig == 0 ) return float64_zero;
3171        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3172    }
3173    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3174    aSig |= LIT64( 0x0010000000000000 );
3175    zSig = estimateSqrt32( aExp, aSig>>21 );
3176    aSig <<= 9 - ( aExp & 1 );
3177    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3178    if ( ( zSig & 0x1FF ) <= 5 ) {
3179        doubleZSig = zSig<<1;
3180        mul64To128( zSig, zSig, &term0, &term1 );
3181        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3182        while ( (sbits64) rem0 < 0 ) {
3183            --zSig;
3184            doubleZSig -= 2;
3185            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3186        }
3187        zSig |= ( ( rem0 | rem1 ) != 0 );
3188    }
3189    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3190
3191}
3192
3193/*----------------------------------------------------------------------------
3194| Returns the binary log of the double-precision floating-point value `a'.
3195| The operation is performed according to the IEC/IEEE Standard for Binary
3196| Floating-Point Arithmetic.
3197*----------------------------------------------------------------------------*/
3198float64 float64_log2( float64 a STATUS_PARAM )
3199{
3200    flag aSign, zSign;
3201    int16 aExp;
3202    bits64 aSig, aSig0, aSig1, zSig, i;
3203
3204    aSig = extractFloat64Frac( a );
3205    aExp = extractFloat64Exp( a );
3206    aSign = extractFloat64Sign( a );
3207
3208    if ( aExp == 0 ) {
3209        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3210        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3211    }
3212    if ( aSign ) {
3213        float_raise( float_flag_invalid STATUS_VAR);
3214        return float64_default_nan;
3215    }
3216    if ( aExp == 0x7FF ) {
3217        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3218        return a;
3219    }
3220
3221    aExp -= 0x3FF;
3222    aSig |= LIT64( 0x0010000000000000 );
3223    zSign = aExp < 0;
3224    zSig = (bits64)aExp << 52;
3225    for (i = 1LL << 51; i > 0; i >>= 1) {
3226        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3227        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3228        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3229            aSig >>= 1;
3230            zSig |= i;
3231        }
3232    }
3233
3234    if ( zSign )
3235        zSig = -zSig;
3236    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3237}
3238
3239/*----------------------------------------------------------------------------
3240| Returns 1 if the double-precision floating-point value `a' is equal to the
3241| corresponding value `b', and 0 otherwise.  The comparison is performed
3242| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3243*----------------------------------------------------------------------------*/
3244
3245int float64_eq( float64 a, float64 b STATUS_PARAM )
3246{
3247    bits64 av, bv;
3248
3249    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3250         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3251       ) {
3252        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3253            float_raise( float_flag_invalid STATUS_VAR);
3254        }
3255        return 0;
3256    }
3257    av = float64_val(a);
3258    bv = float64_val(b);
3259    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3260
3261}
3262
3263/*----------------------------------------------------------------------------
3264| Returns 1 if the double-precision floating-point value `a' is less than or
3265| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3266| performed according to the IEC/IEEE Standard for Binary Floating-Point
3267| Arithmetic.
3268*----------------------------------------------------------------------------*/
3269
3270int float64_le( float64 a, float64 b STATUS_PARAM )
3271{
3272    flag aSign, bSign;
3273    bits64 av, bv;
3274
3275    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3276         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3277       ) {
3278        float_raise( float_flag_invalid STATUS_VAR);
3279        return 0;
3280    }
3281    aSign = extractFloat64Sign( a );
3282    bSign = extractFloat64Sign( b );
3283    av = float64_val(a);
3284    bv = float64_val(b);
3285    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3286    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3287
3288}
3289
3290/*----------------------------------------------------------------------------
3291| Returns 1 if the double-precision floating-point value `a' is less than
3292| the corresponding value `b', and 0 otherwise.  The comparison is performed
3293| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3294*----------------------------------------------------------------------------*/
3295
3296int float64_lt( float64 a, float64 b STATUS_PARAM )
3297{
3298    flag aSign, bSign;
3299    bits64 av, bv;
3300
3301    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3302         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3303       ) {
3304        float_raise( float_flag_invalid STATUS_VAR);
3305        return 0;
3306    }
3307    aSign = extractFloat64Sign( a );
3308    bSign = extractFloat64Sign( b );
3309    av = float64_val(a);
3310    bv = float64_val(b);
3311    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3312    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3313
3314}
3315
3316/*----------------------------------------------------------------------------
3317| Returns 1 if the double-precision floating-point value `a' is equal to the
3318| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3319| if either operand is a NaN.  Otherwise, the comparison is performed
3320| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3321*----------------------------------------------------------------------------*/
3322
3323int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3324{
3325    bits64 av, bv;
3326
3327    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3328         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3329       ) {
3330        float_raise( float_flag_invalid STATUS_VAR);
3331        return 0;
3332    }
3333    av = float64_val(a);
3334    bv = float64_val(b);
3335    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3336
3337}
3338
3339/*----------------------------------------------------------------------------
3340| Returns 1 if the double-precision floating-point value `a' is less than or
3341| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3342| cause an exception.  Otherwise, the comparison is performed according to the
3343| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3344*----------------------------------------------------------------------------*/
3345
3346int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3347{
3348    flag aSign, bSign;
3349    bits64 av, bv;
3350
3351    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3352         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3353       ) {
3354        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3355            float_raise( float_flag_invalid STATUS_VAR);
3356        }
3357        return 0;
3358    }
3359    aSign = extractFloat64Sign( a );
3360    bSign = extractFloat64Sign( b );
3361    av = float64_val(a);
3362    bv = float64_val(b);
3363    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3364    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3365
3366}
3367
3368/*----------------------------------------------------------------------------
3369| Returns 1 if the double-precision floating-point value `a' is less than
3370| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3371| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3372| Standard for Binary Floating-Point Arithmetic.
3373*----------------------------------------------------------------------------*/
3374
3375int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3376{
3377    flag aSign, bSign;
3378    bits64 av, bv;
3379
3380    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3381         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3382       ) {
3383        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3384            float_raise( float_flag_invalid STATUS_VAR);
3385        }
3386        return 0;
3387    }
3388    aSign = extractFloat64Sign( a );
3389    bSign = extractFloat64Sign( b );
3390    av = float64_val(a);
3391    bv = float64_val(b);
3392    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3393    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3394
3395}
3396
3397#ifdef FLOATX80
3398
3399/*----------------------------------------------------------------------------
3400| Returns the result of converting the extended double-precision floating-
3401| point value `a' to the 32-bit two's complement integer format.  The
3402| conversion is performed according to the IEC/IEEE Standard for Binary
3403| Floating-Point Arithmetic---which means in particular that the conversion
3404| is rounded according to the current rounding mode.  If `a' is a NaN, the
3405| largest positive integer is returned.  Otherwise, if the conversion
3406| overflows, the largest integer with the same sign as `a' is returned.
3407*----------------------------------------------------------------------------*/
3408
3409int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3410{
3411    flag aSign;
3412    int32 aExp, shiftCount;
3413    bits64 aSig;
3414
3415    aSig = extractFloatx80Frac( a );
3416    aExp = extractFloatx80Exp( a );
3417    aSign = extractFloatx80Sign( a );
3418    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3419    shiftCount = 0x4037 - aExp;
3420    if ( shiftCount <= 0 ) shiftCount = 1;
3421    shift64RightJamming( aSig, shiftCount, &aSig );
3422    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3423
3424}
3425
3426/*----------------------------------------------------------------------------
3427| Returns the result of converting the extended double-precision floating-
3428| point value `a' to the 32-bit two's complement integer format.  The
3429| conversion is performed according to the IEC/IEEE Standard for Binary
3430| Floating-Point Arithmetic, except that the conversion is always rounded
3431| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3432| Otherwise, if the conversion overflows, the largest integer with the same
3433| sign as `a' is returned.
3434*----------------------------------------------------------------------------*/
3435
3436int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3437{
3438    flag aSign;
3439    int32 aExp, shiftCount;
3440    bits64 aSig, savedASig;
3441    int32 z;
3442
3443    aSig = extractFloatx80Frac( a );
3444    aExp = extractFloatx80Exp( a );
3445    aSign = extractFloatx80Sign( a );
3446    if ( 0x401E < aExp ) {
3447        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3448        goto invalid;
3449    }
3450    else if ( aExp < 0x3FFF ) {
3451        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3452        return 0;
3453    }
3454    shiftCount = 0x403E - aExp;
3455    savedASig = aSig;
3456    aSig >>= shiftCount;
3457    z = aSig;
3458    if ( aSign ) z = - z;
3459    if ( ( z < 0 ) ^ aSign ) {
3460 invalid:
3461        float_raise( float_flag_invalid STATUS_VAR);
3462        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3463    }
3464    if ( ( aSig<<shiftCount ) != savedASig ) {
3465        STATUS(float_exception_flags) |= float_flag_inexact;
3466    }
3467    return z;
3468
3469}
3470
3471/*----------------------------------------------------------------------------
3472| Returns the result of converting the extended double-precision floating-
3473| point value `a' to the 64-bit two's complement integer format.  The
3474| conversion is performed according to the IEC/IEEE Standard for Binary
3475| Floating-Point Arithmetic---which means in particular that the conversion
3476| is rounded according to the current rounding mode.  If `a' is a NaN,
3477| the largest positive integer is returned.  Otherwise, if the conversion
3478| overflows, the largest integer with the same sign as `a' is returned.
3479*----------------------------------------------------------------------------*/
3480
3481int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3482{
3483    flag aSign;
3484    int32 aExp, shiftCount;
3485    bits64 aSig, aSigExtra;
3486
3487    aSig = extractFloatx80Frac( a );
3488    aExp = extractFloatx80Exp( a );
3489    aSign = extractFloatx80Sign( a );
3490    shiftCount = 0x403E - aExp;
3491    if ( shiftCount <= 0 ) {
3492        if ( shiftCount ) {
3493            float_raise( float_flag_invalid STATUS_VAR);
3494            if (    ! aSign
3495                 || (    ( aExp == 0x7FFF )
3496                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3497               ) {
3498                return LIT64( 0x7FFFFFFFFFFFFFFF );
3499            }
3500            return (sbits64) LIT64( 0x8000000000000000 );
3501        }
3502        aSigExtra = 0;
3503    }
3504    else {
3505        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3506    }
3507    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3508
3509}
3510
3511/*----------------------------------------------------------------------------
3512| Returns the result of converting the extended double-precision floating-
3513| point value `a' to the 64-bit two's complement integer format.  The
3514| conversion is performed according to the IEC/IEEE Standard for Binary
3515| Floating-Point Arithmetic, except that the conversion is always rounded
3516| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3517| Otherwise, if the conversion overflows, the largest integer with the same
3518| sign as `a' is returned.
3519*----------------------------------------------------------------------------*/
3520
3521int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3522{
3523    flag aSign;
3524    int32 aExp, shiftCount;
3525    bits64 aSig;
3526    int64 z;
3527
3528    aSig = extractFloatx80Frac( a );
3529    aExp = extractFloatx80Exp( a );
3530    aSign = extractFloatx80Sign( a );
3531    shiftCount = aExp - 0x403E;
3532    if ( 0 <= shiftCount ) {
3533        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3534        if ( ( a.high != 0xC03E ) || aSig ) {
3535            float_raise( float_flag_invalid STATUS_VAR);
3536            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3537                return LIT64( 0x7FFFFFFFFFFFFFFF );
3538            }
3539        }
3540        return (sbits64) LIT64( 0x8000000000000000 );
3541    }
3542    else if ( aExp < 0x3FFF ) {
3543        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3544        return 0;
3545    }
3546    z = aSig>>( - shiftCount );
3547    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3548        STATUS(float_exception_flags) |= float_flag_inexact;
3549    }
3550    if ( aSign ) z = - z;
3551    return z;
3552
3553}
3554
3555/*----------------------------------------------------------------------------
3556| Returns the result of converting the extended double-precision floating-
3557| point value `a' to the single-precision floating-point format.  The
3558| conversion is performed according to the IEC/IEEE Standard for Binary
3559| Floating-Point Arithmetic.
3560*----------------------------------------------------------------------------*/
3561
3562float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3563{
3564    flag aSign;
3565    int32 aExp;
3566    bits64 aSig;
3567
3568    aSig = extractFloatx80Frac( a );
3569    aExp = extractFloatx80Exp( a );
3570    aSign = extractFloatx80Sign( a );
3571    if ( aExp == 0x7FFF ) {
3572        if ( (bits64) ( aSig<<1 ) ) {
3573            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3574        }
3575        return packFloat32( aSign, 0xFF, 0 );
3576    }
3577    shift64RightJamming( aSig, 33, &aSig );
3578    if ( aExp || aSig ) aExp -= 0x3F81;
3579    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3580
3581}
3582
3583/*----------------------------------------------------------------------------
3584| Returns the result of converting the extended double-precision floating-
3585| point value `a' to the double-precision floating-point format.  The
3586| conversion is performed according to the IEC/IEEE Standard for Binary
3587| Floating-Point Arithmetic.
3588*----------------------------------------------------------------------------*/
3589
3590float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3591{
3592    flag aSign;
3593    int32 aExp;
3594    bits64 aSig, zSig;
3595
3596    aSig = extractFloatx80Frac( a );
3597    aExp = extractFloatx80Exp( a );
3598    aSign = extractFloatx80Sign( a );
3599    if ( aExp == 0x7FFF ) {
3600        if ( (bits64) ( aSig<<1 ) ) {
3601            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3602        }
3603        return packFloat64( aSign, 0x7FF, 0 );
3604    }
3605    shift64RightJamming( aSig, 1, &zSig );
3606    if ( aExp || aSig ) aExp -= 0x3C01;
3607    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3608
3609}
3610
3611#ifdef FLOAT128
3612
3613/*----------------------------------------------------------------------------
3614| Returns the result of converting the extended double-precision floating-
3615| point value `a' to the quadruple-precision floating-point format.  The
3616| conversion is performed according to the IEC/IEEE Standard for Binary
3617| Floating-Point Arithmetic.
3618*----------------------------------------------------------------------------*/
3619
3620float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3621{
3622    flag aSign;
3623    int16 aExp;
3624    bits64 aSig, zSig0, zSig1;
3625
3626    aSig = extractFloatx80Frac( a );
3627    aExp = extractFloatx80Exp( a );
3628    aSign = extractFloatx80Sign( a );
3629    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3630        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3631    }
3632    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3633    return packFloat128( aSign, aExp, zSig0, zSig1 );
3634
3635}
3636
3637#endif
3638
3639/*----------------------------------------------------------------------------
3640| Rounds the extended double-precision floating-point value `a' to an integer,
3641| and returns the result as an extended quadruple-precision floating-point
3642| value.  The operation is performed according to the IEC/IEEE Standard for
3643| Binary Floating-Point Arithmetic.
3644*----------------------------------------------------------------------------*/
3645
3646floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3647{
3648    flag aSign;
3649    int32 aExp;
3650    bits64 lastBitMask, roundBitsMask;
3651    int8 roundingMode;
3652    floatx80 z;
3653
3654    aExp = extractFloatx80Exp( a );
3655    if ( 0x403E <= aExp ) {
3656        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3657            return propagateFloatx80NaN( a, a STATUS_VAR );
3658        }
3659        return a;
3660    }
3661    if ( aExp < 0x3FFF ) {
3662        if (    ( aExp == 0 )
3663             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3664            return a;
3665        }
3666        STATUS(float_exception_flags) |= float_flag_inexact;
3667        aSign = extractFloatx80Sign( a );
3668        switch ( STATUS(float_rounding_mode) ) {
3669         case float_round_nearest_even:
3670            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3671               ) {
3672                return
3673                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3674            }
3675            break;
3676         case float_round_down:
3677            return
3678                  aSign ?
3679                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3680                : packFloatx80( 0, 0, 0 );
3681         case float_round_up:
3682            return
3683                  aSign ? packFloatx80( 1, 0, 0 )
3684                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3685        }
3686        return packFloatx80( aSign, 0, 0 );
3687    }
3688    lastBitMask = 1;
3689    lastBitMask <<= 0x403E - aExp;
3690    roundBitsMask = lastBitMask - 1;
3691    z = a;
3692    roundingMode = STATUS(float_rounding_mode);
3693    if ( roundingMode == float_round_nearest_even ) {
3694        z.low += lastBitMask>>1;
3695        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3696    }
3697    else if ( roundingMode != float_round_to_zero ) {
3698        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3699            z.low += roundBitsMask;
3700        }
3701    }
3702    z.low &= ~ roundBitsMask;
3703    if ( z.low == 0 ) {
3704        ++z.high;
3705        z.low = LIT64( 0x8000000000000000 );
3706    }
3707    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3708    return z;
3709
3710}
3711
3712/*----------------------------------------------------------------------------
3713| Returns the result of adding the absolute values of the extended double-
3714| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3715| negated before being returned.  `zSign' is ignored if the result is a NaN.
3716| The addition is performed according to the IEC/IEEE Standard for Binary
3717| Floating-Point Arithmetic.
3718*----------------------------------------------------------------------------*/
3719
3720static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3721{
3722    int32 aExp, bExp, zExp;
3723    bits64 aSig, bSig, zSig0, zSig1;
3724    int32 expDiff;
3725
3726    aSig = extractFloatx80Frac( a );
3727    aExp = extractFloatx80Exp( a );
3728    bSig = extractFloatx80Frac( b );
3729    bExp = extractFloatx80Exp( b );
3730    expDiff = aExp - bExp;
3731    if ( 0 < expDiff ) {
3732        if ( aExp == 0x7FFF ) {
3733            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3734            return a;
3735        }
3736        if ( bExp == 0 ) --expDiff;
3737        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3738        zExp = aExp;
3739    }
3740    else if ( expDiff < 0 ) {
3741        if ( bExp == 0x7FFF ) {
3742            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3743            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3744        }
3745        if ( aExp == 0 ) ++expDiff;
3746        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3747        zExp = bExp;
3748    }
3749    else {
3750        if ( aExp == 0x7FFF ) {
3751            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3752                return propagateFloatx80NaN( a, b STATUS_VAR );
3753            }
3754            return a;
3755        }
3756        zSig1 = 0;
3757        zSig0 = aSig + bSig;
3758        if ( aExp == 0 ) {
3759            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3760            goto roundAndPack;
3761        }
3762        zExp = aExp;
3763        goto shiftRight1;
3764    }
3765    zSig0 = aSig + bSig;
3766    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3767 shiftRight1:
3768    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3769    zSig0 |= LIT64( 0x8000000000000000 );
3770    ++zExp;
3771 roundAndPack:
3772    return
3773        roundAndPackFloatx80(
3774            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3775
3776}
3777
3778/*----------------------------------------------------------------------------
3779| Returns the result of subtracting the absolute values of the extended
3780| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3781| difference is negated before being returned.  `zSign' is ignored if the
3782| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3783| Standard for Binary Floating-Point Arithmetic.
3784*----------------------------------------------------------------------------*/
3785
3786static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3787{
3788    int32 aExp, bExp, zExp;
3789    bits64 aSig, bSig, zSig0, zSig1;
3790    int32 expDiff;
3791    floatx80 z;
3792
3793    aSig = extractFloatx80Frac( a );
3794    aExp = extractFloatx80Exp( a );
3795    bSig = extractFloatx80Frac( b );
3796    bExp = extractFloatx80Exp( b );
3797    expDiff = aExp - bExp;
3798    if ( 0 < expDiff ) goto aExpBigger;
3799    if ( expDiff < 0 ) goto bExpBigger;
3800    if ( aExp == 0x7FFF ) {
3801        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3802            return propagateFloatx80NaN( a, b STATUS_VAR );
3803        }
3804        float_raise( float_flag_invalid STATUS_VAR);
3805        z.low = floatx80_default_nan_low;
3806        z.high = floatx80_default_nan_high;
3807        return z;
3808    }
3809    if ( aExp == 0 ) {
3810        aExp = 1;
3811        bExp = 1;
3812    }
3813    zSig1 = 0;
3814    if ( bSig < aSig ) goto aBigger;
3815    if ( aSig < bSig ) goto bBigger;
3816    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3817 bExpBigger:
3818    if ( bExp == 0x7FFF ) {
3819        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3820        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3821    }
3822    if ( aExp == 0 ) ++expDiff;
3823    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3824 bBigger:
3825    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3826    zExp = bExp;
3827    zSign ^= 1;
3828    goto normalizeRoundAndPack;
3829 aExpBigger:
3830    if ( aExp == 0x7FFF ) {
3831        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3832        return a;
3833    }
3834    if ( bExp == 0 ) --expDiff;
3835    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3836 aBigger:
3837    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3838    zExp = aExp;
3839 normalizeRoundAndPack:
3840    return
3841        normalizeRoundAndPackFloatx80(
3842            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3843
3844}
3845
3846/*----------------------------------------------------------------------------
3847| Returns the result of adding the extended double-precision floating-point
3848| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3849| Standard for Binary Floating-Point Arithmetic.
3850*----------------------------------------------------------------------------*/
3851
3852floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3853{
3854    flag aSign, bSign;
3855
3856    aSign = extractFloatx80Sign( a );
3857    bSign = extractFloatx80Sign( b );
3858    if ( aSign == bSign ) {
3859        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3860    }
3861    else {
3862        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3863    }
3864
3865}
3866
3867/*----------------------------------------------------------------------------
3868| Returns the result of subtracting the extended double-precision floating-
3869| point values `a' and `b'.  The operation is performed according to the
3870| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3871*----------------------------------------------------------------------------*/
3872
3873floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3874{
3875    flag aSign, bSign;
3876
3877    aSign = extractFloatx80Sign( a );
3878    bSign = extractFloatx80Sign( b );
3879    if ( aSign == bSign ) {
3880        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3881    }
3882    else {
3883        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3884    }
3885
3886}
3887
3888/*----------------------------------------------------------------------------
3889| Returns the result of multiplying the extended double-precision floating-
3890| point values `a' and `b'.  The operation is performed according to the
3891| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3892*----------------------------------------------------------------------------*/
3893
3894floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3895{
3896    flag aSign, bSign, zSign;
3897    int32 aExp, bExp, zExp;
3898    bits64 aSig, bSig, zSig0, zSig1;
3899    floatx80 z;
3900
3901    aSig = extractFloatx80Frac( a );
3902    aExp = extractFloatx80Exp( a );
3903    aSign = extractFloatx80Sign( a );
3904    bSig = extractFloatx80Frac( b );
3905    bExp = extractFloatx80Exp( b );
3906    bSign = extractFloatx80Sign( b );
3907    zSign = aSign ^ bSign;
3908    if ( aExp == 0x7FFF ) {
3909        if (    (bits64) ( aSig<<1 )
3910             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3911            return propagateFloatx80NaN( a, b STATUS_VAR );
3912        }
3913        if ( ( bExp | bSig ) == 0 ) goto invalid;
3914        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3915    }
3916    if ( bExp == 0x7FFF ) {
3917        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3918        if ( ( aExp | aSig ) == 0 ) {
3919 invalid:
3920            float_raise( float_flag_invalid STATUS_VAR);
3921            z.low = floatx80_default_nan_low;
3922            z.high = floatx80_default_nan_high;
3923            return z;
3924        }
3925        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3926    }
3927    if ( aExp == 0 ) {
3928        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3929        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3930    }
3931    if ( bExp == 0 ) {
3932        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3933        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3934    }
3935    zExp = aExp + bExp - 0x3FFE;
3936    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3937    if ( 0 < (sbits64) zSig0 ) {
3938        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3939        --zExp;
3940    }
3941    return
3942        roundAndPackFloatx80(
3943            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3944
3945}
3946
3947/*----------------------------------------------------------------------------
3948| Returns the result of dividing the extended double-precision floating-point
3949| value `a' by the corresponding value `b'.  The operation is performed
3950| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3951*----------------------------------------------------------------------------*/
3952
3953floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3954{
3955    flag aSign, bSign, zSign;
3956    int32 aExp, bExp, zExp;
3957    bits64 aSig, bSig, zSig0, zSig1;
3958    bits64 rem0, rem1, rem2, term0, term1, term2;
3959    floatx80 z;
3960
3961    aSig = extractFloatx80Frac( a );
3962    aExp = extractFloatx80Exp( a );
3963    aSign = extractFloatx80Sign( a );
3964    bSig = extractFloatx80Frac( b );
3965    bExp = extractFloatx80Exp( b );
3966    bSign = extractFloatx80Sign( b );
3967    zSign = aSign ^ bSign;
3968    if ( aExp == 0x7FFF ) {
3969        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3970        if ( bExp == 0x7FFF ) {
3971            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3972            goto invalid;
3973        }
3974        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3975    }
3976    if ( bExp == 0x7FFF ) {
3977        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3978        return packFloatx80( zSign, 0, 0 );
3979    }
3980    if ( bExp == 0 ) {
3981        if ( bSig == 0 ) {
3982            if ( ( aExp | aSig ) == 0 ) {
3983 invalid:
3984                float_raise( float_flag_invalid STATUS_VAR);
3985                z.low = floatx80_default_nan_low;
3986                z.high = floatx80_default_nan_high;
3987                return z;
3988            }
3989            float_raise( float_flag_divbyzero STATUS_VAR);
3990            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3991        }
3992        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3993    }
3994    if ( aExp == 0 ) {
3995        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3996        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3997    }
3998    zExp = aExp - bExp + 0x3FFE;
3999    rem1 = 0;
4000    if ( bSig <= aSig ) {
4001        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4002        ++zExp;
4003    }
4004    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4005    mul64To128( bSig, zSig0, &term0, &term1 );
4006    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4007    while ( (sbits64) rem0 < 0 ) {
4008        --zSig0;
4009        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4010    }
4011    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4012    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4013        mul64To128( bSig, zSig1, &term1, &term2 );
4014        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4015        while ( (sbits64) rem1 < 0 ) {
4016            --zSig1;
4017            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4018        }
4019        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4020    }
4021    return
4022        roundAndPackFloatx80(
4023            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4024
4025}
4026
4027/*----------------------------------------------------------------------------
4028| Returns the remainder of the extended double-precision floating-point value
4029| `a' with respect to the corresponding value `b'.  The operation is performed
4030| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4031*----------------------------------------------------------------------------*/
4032
4033floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4034{
4035    flag aSign, bSign, zSign;
4036    int32 aExp, bExp, expDiff;
4037    bits64 aSig0, aSig1, bSig;
4038    bits64 q, term0, term1, alternateASig0, alternateASig1;
4039    floatx80 z;
4040
4041    aSig0 = extractFloatx80Frac( a );
4042    aExp = extractFloatx80Exp( a );
4043    aSign = extractFloatx80Sign( a );
4044    bSig = extractFloatx80Frac( b );
4045    bExp = extractFloatx80Exp( b );
4046    bSign = extractFloatx80Sign( b );
4047    if ( aExp == 0x7FFF ) {
4048        if (    (bits64) ( aSig0<<1 )
4049             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4050            return propagateFloatx80NaN( a, b STATUS_VAR );
4051        }
4052        goto invalid;
4053    }
4054    if ( bExp == 0x7FFF ) {
4055        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4056        return a;
4057    }
4058    if ( bExp == 0 ) {
4059        if ( bSig == 0 ) {
4060 invalid:
4061            float_raise( float_flag_invalid STATUS_VAR);
4062            z.low = floatx80_default_nan_low;
4063            z.high = floatx80_default_nan_high;
4064            return z;
4065        }
4066        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4067    }
4068    if ( aExp == 0 ) {
4069        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4070        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4071    }
4072    bSig |= LIT64( 0x8000000000000000 );
4073    zSign = aSign;
4074    expDiff = aExp - bExp;
4075    aSig1 = 0;
4076    if ( expDiff < 0 ) {
4077        if ( expDiff < -1 ) return a;
4078        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4079        expDiff = 0;
4080    }
4081    q = ( bSig <= aSig0 );
4082    if ( q ) aSig0 -= bSig;
4083    expDiff -= 64;
4084    while ( 0 < expDiff ) {
4085        q = estimateDiv128To64( aSig0, aSig1, bSig );
4086        q = ( 2 < q ) ? q - 2 : 0;
4087        mul64To128( bSig, q, &term0, &term1 );
4088        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4089        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4090        expDiff -= 62;
4091    }
4092    expDiff += 64;
4093    if ( 0 < expDiff ) {
4094        q = estimateDiv128To64( aSig0, aSig1, bSig );
4095        q = ( 2 < q ) ? q - 2 : 0;
4096        q >>= 64 - expDiff;
4097        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4098        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4099        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4100        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4101            ++q;
4102            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4103        }
4104    }
4105    else {
4106        term1 = 0;
4107        term0 = bSig;
4108    }
4109    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4110    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4111         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4112              && ( q & 1 ) )
4113       ) {
4114        aSig0 = alternateASig0;
4115        aSig1 = alternateASig1;
4116        zSign = ! zSign;
4117    }
4118    return
4119        normalizeRoundAndPackFloatx80(
4120            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4121
4122}
4123
4124/*----------------------------------------------------------------------------
4125| Returns the square root of the extended double-precision floating-point
4126| value `a'.  The operation is performed according to the IEC/IEEE Standard
4127| for Binary Floating-Point Arithmetic.
4128*----------------------------------------------------------------------------*/
4129
4130floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4131{
4132    flag aSign;
4133    int32 aExp, zExp;
4134    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4135    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4136    floatx80 z;
4137
4138    aSig0 = extractFloatx80Frac( a );
4139    aExp = extractFloatx80Exp( a );
4140    aSign = extractFloatx80Sign( a );
4141    if ( aExp == 0x7FFF ) {
4142        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4143        if ( ! aSign ) return a;
4144        goto invalid;
4145    }
4146    if ( aSign ) {
4147        if ( ( aExp | aSig0 ) == 0 ) return a;
4148 invalid:
4149        float_raise( float_flag_invalid STATUS_VAR);
4150        z.low = floatx80_default_nan_low;
4151        z.high = floatx80_default_nan_high;
4152        return z;
4153    }
4154    if ( aExp == 0 ) {
4155        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4156        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4157    }
4158    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4159    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4160    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4161    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4162    doubleZSig0 = zSig0<<1;
4163    mul64To128( zSig0, zSig0, &term0, &term1 );
4164    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4165    while ( (sbits64) rem0 < 0 ) {
4166        --zSig0;
4167        doubleZSig0 -= 2;
4168        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4169    }
4170    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4171    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4172        if ( zSig1 == 0 ) zSig1 = 1;
4173        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4174        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4175        mul64To128( zSig1, zSig1, &term2, &term3 );
4176        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4177        while ( (sbits64) rem1 < 0 ) {
4178            --zSig1;
4179            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4180            term3 |= 1;
4181            term2 |= doubleZSig0;
4182            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4183        }
4184        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4185    }
4186    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4187    zSig0 |= doubleZSig0;
4188    return
4189        roundAndPackFloatx80(
4190            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4191
4192}
4193
4194/*----------------------------------------------------------------------------
4195| Returns 1 if the extended double-precision floating-point value `a' is
4196| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4197| performed according to the IEC/IEEE Standard for Binary Floating-Point
4198| Arithmetic.
4199*----------------------------------------------------------------------------*/
4200
4201int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4202{
4203
4204    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4205              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4206         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4207              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4208       ) {
4209        if (    floatx80_is_signaling_nan( a )
4210             || floatx80_is_signaling_nan( b ) ) {
4211            float_raise( float_flag_invalid STATUS_VAR);
4212        }
4213        return 0;
4214    }
4215    return
4216           ( a.low == b.low )
4217        && (    ( a.high == b.high )
4218             || (    ( a.low == 0 )
4219                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4220           );
4221
4222}
4223
4224/*----------------------------------------------------------------------------
4225| Returns 1 if the extended double-precision floating-point value `a' is
4226| less than or equal to the corresponding value `b', and 0 otherwise.  The
4227| comparison is performed according to the IEC/IEEE Standard for Binary
4228| Floating-Point Arithmetic.
4229*----------------------------------------------------------------------------*/
4230
4231int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4232{
4233    flag aSign, bSign;
4234
4235    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4236              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4237         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4238              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4239       ) {
4240        float_raise( float_flag_invalid STATUS_VAR);
4241        return 0;
4242    }
4243    aSign = extractFloatx80Sign( a );
4244    bSign = extractFloatx80Sign( b );
4245    if ( aSign != bSign ) {
4246        return
4247               aSign
4248            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4249                 == 0 );
4250    }
4251    return
4252          aSign ? le128( b.high, b.low, a.high, a.low )
4253        : le128( a.high, a.low, b.high, b.low );
4254
4255}
4256
4257/*----------------------------------------------------------------------------
4258| Returns 1 if the extended double-precision floating-point value `a' is
4259| less than the corresponding value `b', and 0 otherwise.  The comparison
4260| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4261| Arithmetic.
4262*----------------------------------------------------------------------------*/
4263
4264int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4265{
4266    flag aSign, bSign;
4267
4268    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4269              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4270         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4271              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4272       ) {
4273        float_raise( float_flag_invalid STATUS_VAR);
4274        return 0;
4275    }
4276    aSign = extractFloatx80Sign( a );
4277    bSign = extractFloatx80Sign( b );
4278    if ( aSign != bSign ) {
4279        return
4280               aSign
4281            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4282                 != 0 );
4283    }
4284    return
4285          aSign ? lt128( b.high, b.low, a.high, a.low )
4286        : lt128( a.high, a.low, b.high, b.low );
4287
4288}
4289
4290/*----------------------------------------------------------------------------
4291| Returns 1 if the extended double-precision floating-point value `a' is equal
4292| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4293| raised if either operand is a NaN.  Otherwise, the comparison is performed
4294| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4295*----------------------------------------------------------------------------*/
4296
4297int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4298{
4299
4300    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4301              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4302         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4303              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4304       ) {
4305        float_raise( float_flag_invalid STATUS_VAR);
4306        return 0;
4307    }
4308    return
4309           ( a.low == b.low )
4310        && (    ( a.high == b.high )
4311             || (    ( a.low == 0 )
4312                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4313           );
4314
4315}
4316
4317/*----------------------------------------------------------------------------
4318| Returns 1 if the extended double-precision floating-point value `a' is less
4319| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4320| do not cause an exception.  Otherwise, the comparison is performed according
4321| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4322*----------------------------------------------------------------------------*/
4323
4324int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4325{
4326    flag aSign, bSign;
4327
4328    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4329              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4330         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4331              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4332       ) {
4333        if (    floatx80_is_signaling_nan( a )
4334             || floatx80_is_signaling_nan( b ) ) {
4335            float_raise( float_flag_invalid STATUS_VAR);
4336        }
4337        return 0;
4338    }
4339    aSign = extractFloatx80Sign( a );
4340    bSign = extractFloatx80Sign( b );
4341    if ( aSign != bSign ) {
4342        return
4343               aSign
4344            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4345                 == 0 );
4346    }
4347    return
4348          aSign ? le128( b.high, b.low, a.high, a.low )
4349        : le128( a.high, a.low, b.high, b.low );
4350
4351}
4352
4353/*----------------------------------------------------------------------------
4354| Returns 1 if the extended double-precision floating-point value `a' is less
4355| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4356| an exception.  Otherwise, the comparison is performed according to the
4357| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4358*----------------------------------------------------------------------------*/
4359
4360int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4361{
4362    flag aSign, bSign;
4363
4364    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4365              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4366         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4367              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4368       ) {
4369        if (    floatx80_is_signaling_nan( a )
4370             || floatx80_is_signaling_nan( b ) ) {
4371            float_raise( float_flag_invalid STATUS_VAR);
4372        }
4373        return 0;
4374    }
4375    aSign = extractFloatx80Sign( a );
4376    bSign = extractFloatx80Sign( b );
4377    if ( aSign != bSign ) {
4378        return
4379               aSign
4380            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4381                 != 0 );
4382    }
4383    return
4384          aSign ? lt128( b.high, b.low, a.high, a.low )
4385        : lt128( a.high, a.low, b.high, b.low );
4386
4387}
4388
4389#endif
4390
4391#ifdef FLOAT128
4392
4393/*----------------------------------------------------------------------------
4394| Returns the result of converting the quadruple-precision floating-point
4395| value `a' to the 32-bit two's complement integer format.  The conversion
4396| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4397| Arithmetic---which means in particular that the conversion is rounded
4398| according to the current rounding mode.  If `a' is a NaN, the largest
4399| positive integer is returned.  Otherwise, if the conversion overflows, the
4400| largest integer with the same sign as `a' is returned.
4401*----------------------------------------------------------------------------*/
4402
4403int32 float128_to_int32( float128 a STATUS_PARAM )
4404{
4405    flag aSign;
4406    int32 aExp, shiftCount;
4407    bits64 aSig0, aSig1;
4408
4409    aSig1 = extractFloat128Frac1( a );
4410    aSig0 = extractFloat128Frac0( a );
4411    aExp = extractFloat128Exp( a );
4412    aSign = extractFloat128Sign( a );
4413    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4414    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4415    aSig0 |= ( aSig1 != 0 );
4416    shiftCount = 0x4028 - aExp;
4417    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4418    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4419
4420}
4421
4422/*----------------------------------------------------------------------------
4423| Returns the result of converting the quadruple-precision floating-point
4424| value `a' to the 32-bit two's complement integer format.  The conversion
4425| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4426| Arithmetic, except that the conversion is always rounded toward zero.  If
4427| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4428| conversion overflows, the largest integer with the same sign as `a' is
4429| returned.
4430*----------------------------------------------------------------------------*/
4431
4432int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4433{
4434    flag aSign;
4435    int32 aExp, shiftCount;
4436    bits64 aSig0, aSig1, savedASig;
4437    int32 z;
4438
4439    aSig1 = extractFloat128Frac1( a );
4440    aSig0 = extractFloat128Frac0( a );
4441    aExp = extractFloat128Exp( a );
4442    aSign = extractFloat128Sign( a );
4443    aSig0 |= ( aSig1 != 0 );
4444    if ( 0x401E < aExp ) {
4445        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4446        goto invalid;
4447    }
4448    else if ( aExp < 0x3FFF ) {
4449        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4450        return 0;
4451    }
4452    aSig0 |= LIT64( 0x0001000000000000 );
4453    shiftCount = 0x402F - aExp;
4454    savedASig = aSig0;
4455    aSig0 >>= shiftCount;
4456    z = aSig0;
4457    if ( aSign ) z = - z;
4458    if ( ( z < 0 ) ^ aSign ) {
4459 invalid:
4460        float_raise( float_flag_invalid STATUS_VAR);
4461        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4462    }
4463    if ( ( aSig0<<shiftCount ) != savedASig ) {
4464        STATUS(float_exception_flags) |= float_flag_inexact;
4465    }
4466    return z;
4467
4468}
4469
4470/*----------------------------------------------------------------------------
4471| Returns the result of converting the quadruple-precision floating-point
4472| value `a' to the 64-bit two's complement integer format.  The conversion
4473| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4474| Arithmetic---which means in particular that the conversion is rounded
4475| according to the current rounding mode.  If `a' is a NaN, the largest
4476| positive integer is returned.  Otherwise, if the conversion overflows, the
4477| largest integer with the same sign as `a' is returned.
4478*----------------------------------------------------------------------------*/
4479
4480int64 float128_to_int64( float128 a STATUS_PARAM )
4481{
4482    flag aSign;
4483    int32 aExp, shiftCount;
4484    bits64 aSig0, aSig1;
4485
4486    aSig1 = extractFloat128Frac1( a );
4487    aSig0 = extractFloat128Frac0( a );
4488    aExp = extractFloat128Exp( a );
4489    aSign = extractFloat128Sign( a );
4490    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4491    shiftCount = 0x402F - aExp;
4492    if ( shiftCount <= 0 ) {
4493        if ( 0x403E < aExp ) {
4494            float_raise( float_flag_invalid STATUS_VAR);
4495            if (    ! aSign
4496                 || (    ( aExp == 0x7FFF )
4497                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4498                    )
4499               ) {
4500                return LIT64( 0x7FFFFFFFFFFFFFFF );
4501            }
4502            return (sbits64) LIT64( 0x8000000000000000 );
4503        }
4504        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4505    }
4506    else {
4507        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4508    }
4509    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4510
4511}
4512
4513/*----------------------------------------------------------------------------
4514| Returns the result of converting the quadruple-precision floating-point
4515| value `a' to the 64-bit two's complement integer format.  The conversion
4516| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4517| Arithmetic, except that the conversion is always rounded toward zero.
4518| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4519| the conversion overflows, the largest integer with the same sign as `a' is
4520| returned.
4521*----------------------------------------------------------------------------*/
4522
4523int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4524{
4525    flag aSign;
4526    int32 aExp, shiftCount;
4527    bits64 aSig0, aSig1;
4528    int64 z;
4529
4530    aSig1 = extractFloat128Frac1( a );
4531    aSig0 = extractFloat128Frac0( a );
4532    aExp = extractFloat128Exp( a );
4533    aSign = extractFloat128Sign( a );
4534    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4535    shiftCount = aExp - 0x402F;
4536    if ( 0 < shiftCount ) {
4537        if ( 0x403E <= aExp ) {
4538            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4539            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4540                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4541                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4542            }
4543            else {
4544                float_raise( float_flag_invalid STATUS_VAR);
4545                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4546                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4547                }
4548            }
4549            return (sbits64) LIT64( 0x8000000000000000 );
4550        }
4551        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4552        if ( (bits64) ( aSig1<<shiftCount ) ) {
4553            STATUS(float_exception_flags) |= float_flag_inexact;
4554        }
4555    }
4556    else {
4557        if ( aExp < 0x3FFF ) {
4558            if ( aExp | aSig0 | aSig1 ) {
4559                STATUS(float_exception_flags) |= float_flag_inexact;
4560            }
4561            return 0;
4562        }
4563        z = aSig0>>( - shiftCount );
4564        if (    aSig1
4565             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4566            STATUS(float_exception_flags) |= float_flag_inexact;
4567        }
4568    }
4569    if ( aSign ) z = - z;
4570    return z;
4571
4572}
4573
4574/*----------------------------------------------------------------------------
4575| Returns the result of converting the quadruple-precision floating-point
4576| value `a' to the single-precision floating-point format.  The conversion
4577| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4578| Arithmetic.
4579*----------------------------------------------------------------------------*/
4580
4581float32 float128_to_float32( float128 a STATUS_PARAM )
4582{
4583    flag aSign;
4584    int32 aExp;
4585    bits64 aSig0, aSig1;
4586    bits32 zSig;
4587
4588    aSig1 = extractFloat128Frac1( a );
4589    aSig0 = extractFloat128Frac0( a );
4590    aExp = extractFloat128Exp( a );
4591    aSign = extractFloat128Sign( a );
4592    if ( aExp == 0x7FFF ) {
4593        if ( aSig0 | aSig1 ) {
4594            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4595        }
4596        return packFloat32( aSign, 0xFF, 0 );
4597    }
4598    aSig0 |= ( aSig1 != 0 );
4599    shift64RightJamming( aSig0, 18, &aSig0 );
4600    zSig = aSig0;
4601    if ( aExp || zSig ) {
4602        zSig |= 0x40000000;
4603        aExp -= 0x3F81;
4604    }
4605    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4606
4607}
4608
4609/*----------------------------------------------------------------------------
4610| Returns the result of converting the quadruple-precision floating-point
4611| value `a' to the double-precision floating-point format.  The conversion
4612| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4613| Arithmetic.
4614*----------------------------------------------------------------------------*/
4615
4616float64 float128_to_float64( float128 a STATUS_PARAM )
4617{
4618    flag aSign;
4619    int32 aExp;
4620    bits64 aSig0, aSig1;
4621
4622    aSig1 = extractFloat128Frac1( a );
4623    aSig0 = extractFloat128Frac0( a );
4624    aExp = extractFloat128Exp( a );
4625    aSign = extractFloat128Sign( a );
4626    if ( aExp == 0x7FFF ) {
4627        if ( aSig0 | aSig1 ) {
4628            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4629        }
4630        return packFloat64( aSign, 0x7FF, 0 );
4631    }
4632    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4633    aSig0 |= ( aSig1 != 0 );
4634    if ( aExp || aSig0 ) {
4635        aSig0 |= LIT64( 0x4000000000000000 );
4636        aExp -= 0x3C01;
4637    }
4638    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4639
4640}
4641
4642#ifdef FLOATX80
4643
4644/*----------------------------------------------------------------------------
4645| Returns the result of converting the quadruple-precision floating-point
4646| value `a' to the extended double-precision floating-point format.  The
4647| conversion is performed according to the IEC/IEEE Standard for Binary
4648| Floating-Point Arithmetic.
4649*----------------------------------------------------------------------------*/
4650
4651floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4652{
4653    flag aSign;
4654    int32 aExp;
4655    bits64 aSig0, aSig1;
4656
4657    aSig1 = extractFloat128Frac1( a );
4658    aSig0 = extractFloat128Frac0( a );
4659    aExp = extractFloat128Exp( a );
4660    aSign = extractFloat128Sign( a );
4661    if ( aExp == 0x7FFF ) {
4662        if ( aSig0 | aSig1 ) {
4663            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4664        }
4665        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4666    }
4667    if ( aExp == 0 ) {
4668        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4669        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4670    }
4671    else {
4672        aSig0 |= LIT64( 0x0001000000000000 );
4673    }
4674    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4675    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4676
4677}
4678
4679#endif
4680
4681/*----------------------------------------------------------------------------
4682| Rounds the quadruple-precision floating-point value `a' to an integer, and
4683| returns the result as a quadruple-precision floating-point value.  The
4684| operation is performed according to the IEC/IEEE Standard for Binary
4685| Floating-Point Arithmetic.
4686*----------------------------------------------------------------------------*/
4687
4688float128 float128_round_to_int( float128 a STATUS_PARAM )
4689{
4690    flag aSign;
4691    int32 aExp;
4692    bits64 lastBitMask, roundBitsMask;
4693    int8 roundingMode;
4694    float128 z;
4695
4696    aExp = extractFloat128Exp( a );
4697    if ( 0x402F <= aExp ) {
4698        if ( 0x406F <= aExp ) {
4699            if (    ( aExp == 0x7FFF )
4700                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4701               ) {
4702                return propagateFloat128NaN( a, a STATUS_VAR );
4703            }
4704            return a;
4705        }
4706        lastBitMask = 1;
4707        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4708        roundBitsMask = lastBitMask - 1;
4709        z = a;
4710        roundingMode = STATUS(float_rounding_mode);
4711        if ( roundingMode == float_round_nearest_even ) {
4712            if ( lastBitMask ) {
4713                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4714                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4715            }
4716            else {
4717                if ( (sbits64) z.low < 0 ) {
4718                    ++z.high;
4719                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4720                }
4721            }
4722        }
4723        else if ( roundingMode != float_round_to_zero ) {
4724            if (   extractFloat128Sign( z )
4725                 ^ ( roundingMode == float_round_up ) ) {
4726                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4727            }
4728        }
4729        z.low &= ~ roundBitsMask;
4730    }
4731    else {
4732        if ( aExp < 0x3FFF ) {
4733            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4734            STATUS(float_exception_flags) |= float_flag_inexact;
4735            aSign = extractFloat128Sign( a );
4736            switch ( STATUS(float_rounding_mode) ) {
4737             case float_round_nearest_even:
4738                if (    ( aExp == 0x3FFE )
4739                     && (   extractFloat128Frac0( a )
4740                          | extractFloat128Frac1( a ) )
4741                   ) {
4742                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4743                }
4744                break;
4745             case float_round_down:
4746                return
4747                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4748                    : packFloat128( 0, 0, 0, 0 );
4749             case float_round_up:
4750                return
4751                      aSign ? packFloat128( 1, 0, 0, 0 )
4752                    : packFloat128( 0, 0x3FFF, 0, 0 );
4753            }
4754            return packFloat128( aSign, 0, 0, 0 );
4755        }
4756        lastBitMask = 1;
4757        lastBitMask <<= 0x402F - aExp;
4758        roundBitsMask = lastBitMask - 1;
4759        z.low = 0;
4760        z.high = a.high;
4761        roundingMode = STATUS(float_rounding_mode);
4762        if ( roundingMode == float_round_nearest_even ) {
4763            z.high += lastBitMask>>1;
4764            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4765                z.high &= ~ lastBitMask;
4766            }
4767        }
4768        else if ( roundingMode != float_round_to_zero ) {
4769            if (   extractFloat128Sign( z )
4770                 ^ ( roundingMode == float_round_up ) ) {
4771                z.high |= ( a.low != 0 );
4772                z.high += roundBitsMask;
4773            }
4774        }
4775        z.high &= ~ roundBitsMask;
4776    }
4777    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4778        STATUS(float_exception_flags) |= float_flag_inexact;
4779    }
4780    return z;
4781
4782}
4783
4784/*----------------------------------------------------------------------------
4785| Returns the result of adding the absolute values of the quadruple-precision
4786| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4787| before being returned.  `zSign' is ignored if the result is a NaN.
4788| The addition is performed according to the IEC/IEEE Standard for Binary
4789| Floating-Point Arithmetic.
4790*----------------------------------------------------------------------------*/
4791
4792static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4793{
4794    int32 aExp, bExp, zExp;
4795    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4796    int32 expDiff;
4797
4798    aSig1 = extractFloat128Frac1( a );
4799    aSig0 = extractFloat128Frac0( a );
4800    aExp = extractFloat128Exp( a );
4801    bSig1 = extractFloat128Frac1( b );
4802    bSig0 = extractFloat128Frac0( b );
4803    bExp = extractFloat128Exp( b );
4804    expDiff = aExp - bExp;
4805    if ( 0 < expDiff ) {
4806        if ( aExp == 0x7FFF ) {
4807            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4808            return a;
4809        }
4810        if ( bExp == 0 ) {
4811            --expDiff;
4812        }
4813        else {
4814            bSig0 |= LIT64( 0x0001000000000000 );
4815        }
4816        shift128ExtraRightJamming(
4817            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4818        zExp = aExp;
4819    }
4820    else if ( expDiff < 0 ) {
4821        if ( bExp == 0x7FFF ) {
4822            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4823            return packFloat128( zSign, 0x7FFF, 0, 0 );
4824        }
4825        if ( aExp == 0 ) {
4826            ++expDiff;
4827        }
4828        else {
4829            aSig0 |= LIT64( 0x0001000000000000 );
4830        }
4831        shift128ExtraRightJamming(
4832            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4833        zExp = bExp;
4834    }
4835    else {
4836        if ( aExp == 0x7FFF ) {
4837            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4838                return propagateFloat128NaN( a, b STATUS_VAR );
4839            }
4840            return a;
4841        }
4842        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4843        if ( aExp == 0 ) {
4844            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4845            return packFloat128( zSign, 0, zSig0, zSig1 );
4846        }
4847        zSig2 = 0;
4848        zSig0 |= LIT64( 0x0002000000000000 );
4849        zExp = aExp;
4850        goto shiftRight1;
4851    }
4852    aSig0 |= LIT64( 0x0001000000000000 );
4853    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4854    --zExp;
4855    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4856    ++zExp;
4857 shiftRight1:
4858    shift128ExtraRightJamming(
4859        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4860 roundAndPack:
4861    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4862
4863}
4864
4865/*----------------------------------------------------------------------------
4866| Returns the result of subtracting the absolute values of the quadruple-
4867| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4868| difference is negated before being returned.  `zSign' is ignored if the
4869| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4870| Standard for Binary Floating-Point Arithmetic.
4871*----------------------------------------------------------------------------*/
4872
4873static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4874{
4875    int32 aExp, bExp, zExp;
4876    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4877    int32 expDiff;
4878    float128 z;
4879
4880    aSig1 = extractFloat128Frac1( a );
4881    aSig0 = extractFloat128Frac0( a );
4882    aExp = extractFloat128Exp( a );
4883    bSig1 = extractFloat128Frac1( b );
4884    bSig0 = extractFloat128Frac0( b );
4885    bExp = extractFloat128Exp( b );
4886    expDiff = aExp - bExp;
4887    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4888    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4889    if ( 0 < expDiff ) goto aExpBigger;
4890    if ( expDiff < 0 ) goto bExpBigger;
4891    if ( aExp == 0x7FFF ) {
4892        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4893            return propagateFloat128NaN( a, b STATUS_VAR );
4894        }
4895        float_raise( float_flag_invalid STATUS_VAR);
4896        z.low = float128_default_nan_low;
4897        z.high = float128_default_nan_high;
4898        return z;
4899    }
4900    if ( aExp == 0 ) {
4901        aExp = 1;
4902        bExp = 1;
4903    }
4904    if ( bSig0 < aSig0 ) goto aBigger;
4905    if ( aSig0 < bSig0 ) goto bBigger;
4906    if ( bSig1 < aSig1 ) goto aBigger;
4907    if ( aSig1 < bSig1 ) goto bBigger;
4908    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4909 bExpBigger:
4910    if ( bExp == 0x7FFF ) {
4911        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4912        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4913    }
4914    if ( aExp == 0 ) {
4915        ++expDiff;
4916    }
4917    else {
4918        aSig0 |= LIT64( 0x4000000000000000 );
4919    }
4920    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4921    bSig0 |= LIT64( 0x4000000000000000 );
4922 bBigger:
4923    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4924    zExp = bExp;
4925    zSign ^= 1;
4926    goto normalizeRoundAndPack;
4927 aExpBigger:
4928    if ( aExp == 0x7FFF ) {
4929        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4930        return a;
4931    }
4932    if ( bExp == 0 ) {
4933        --expDiff;
4934    }
4935    else {
4936        bSig0 |= LIT64( 0x4000000000000000 );
4937    }
4938    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4939    aSig0 |= LIT64( 0x4000000000000000 );
4940 aBigger:
4941    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4942    zExp = aExp;
4943 normalizeRoundAndPack:
4944    --zExp;
4945    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4946
4947}
4948
4949/*----------------------------------------------------------------------------
4950| Returns the result of adding the quadruple-precision floating-point values
4951| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4952| for Binary Floating-Point Arithmetic.
4953*----------------------------------------------------------------------------*/
4954
4955float128 float128_add( float128 a, float128 b STATUS_PARAM )
4956{
4957    flag aSign, bSign;
4958
4959    aSign = extractFloat128Sign( a );
4960    bSign = extractFloat128Sign( b );
4961    if ( aSign == bSign ) {
4962        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4963    }
4964    else {
4965        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4966    }
4967
4968}
4969
4970/*----------------------------------------------------------------------------
4971| Returns the result of subtracting the quadruple-precision floating-point
4972| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4973| Standard for Binary Floating-Point Arithmetic.
4974*----------------------------------------------------------------------------*/
4975
4976float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4977{
4978    flag aSign, bSign;
4979
4980    aSign = extractFloat128Sign( a );
4981    bSign = extractFloat128Sign( b );
4982    if ( aSign == bSign ) {
4983        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4984    }
4985    else {
4986        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4987    }
4988
4989}
4990
4991/*----------------------------------------------------------------------------
4992| Returns the result of multiplying the quadruple-precision floating-point
4993| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4994| Standard for Binary Floating-Point Arithmetic.
4995*----------------------------------------------------------------------------*/
4996
4997float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4998{
4999    flag aSign, bSign, zSign;
5000    int32 aExp, bExp, zExp;
5001    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5002    float128 z;
5003
5004    aSig1 = extractFloat128Frac1( a );
5005    aSig0 = extractFloat128Frac0( a );
5006    aExp = extractFloat128Exp( a );
5007    aSign = extractFloat128Sign( a );
5008    bSig1 = extractFloat128Frac1( b );
5009    bSig0 = extractFloat128Frac0( b );
5010    bExp = extractFloat128Exp( b );
5011    bSign = extractFloat128Sign( b );
5012    zSign = aSign ^ bSign;
5013    if ( aExp == 0x7FFF ) {
5014        if (    ( aSig0 | aSig1 )
5015             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5016            return propagateFloat128NaN( a, b STATUS_VAR );
5017        }
5018        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5019        return packFloat128( zSign, 0x7FFF, 0, 0 );
5020    }
5021    if ( bExp == 0x7FFF ) {
5022        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5023        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5024 invalid:
5025            float_raise( float_flag_invalid STATUS_VAR);
5026            z.low = float128_default_nan_low;
5027            z.high = float128_default_nan_high;
5028            return z;
5029        }
5030        return packFloat128( zSign, 0x7FFF, 0, 0 );
5031    }
5032    if ( aExp == 0 ) {
5033        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5034        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5035    }
5036    if ( bExp == 0 ) {
5037        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5038        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5039    }
5040    zExp = aExp + bExp - 0x4000;
5041    aSig0 |= LIT64( 0x0001000000000000 );
5042    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5043    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5044    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5045    zSig2 |= ( zSig3 != 0 );
5046    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5047        shift128ExtraRightJamming(
5048            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5049        ++zExp;
5050    }
5051    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5052
5053}
5054
5055/*----------------------------------------------------------------------------
5056| Returns the result of dividing the quadruple-precision floating-point value
5057| `a' by the corresponding value `b'.  The operation is performed according to
5058| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5059*----------------------------------------------------------------------------*/
5060
5061float128 float128_div( float128 a, float128 b STATUS_PARAM )
5062{
5063    flag aSign, bSign, zSign;
5064    int32 aExp, bExp, zExp;
5065    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5066    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5067    float128 z;
5068
5069    aSig1 = extractFloat128Frac1( a );
5070    aSig0 = extractFloat128Frac0( a );
5071    aExp = extractFloat128Exp( a );
5072    aSign = extractFloat128Sign( a );
5073    bSig1 = extractFloat128Frac1( b );
5074    bSig0 = extractFloat128Frac0( b );
5075    bExp = extractFloat128Exp( b );
5076    bSign = extractFloat128Sign( b );
5077    zSign = aSign ^ bSign;
5078    if ( aExp == 0x7FFF ) {
5079        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5080        if ( bExp == 0x7FFF ) {
5081            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5082            goto invalid;
5083        }
5084        return packFloat128( zSign, 0x7FFF, 0, 0 );
5085    }
5086    if ( bExp == 0x7FFF ) {
5087        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5088        return packFloat128( zSign, 0, 0, 0 );
5089    }
5090    if ( bExp == 0 ) {
5091        if ( ( bSig0 | bSig1 ) == 0 ) {
5092            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5093 invalid:
5094                float_raise( float_flag_invalid STATUS_VAR);
5095                z.low = float128_default_nan_low;
5096                z.high = float128_default_nan_high;
5097                return z;
5098            }
5099            float_raise( float_flag_divbyzero STATUS_VAR);
5100            return packFloat128( zSign, 0x7FFF, 0, 0 );
5101        }
5102        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5103    }
5104    if ( aExp == 0 ) {
5105        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5106        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5107    }
5108    zExp = aExp - bExp + 0x3FFD;
5109    shortShift128Left(
5110        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5111    shortShift128Left(
5112        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5113    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5114        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5115        ++zExp;
5116    }
5117    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5118    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5119    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5120    while ( (sbits64) rem0 < 0 ) {
5121        --zSig0;
5122        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5123    }
5124    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5125    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5126        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5127        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5128        while ( (sbits64) rem1 < 0 ) {
5129            --zSig1;
5130            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5131        }
5132        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5133    }
5134    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5135    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5136
5137}
5138
5139/*----------------------------------------------------------------------------
5140| Returns the remainder of the quadruple-precision floating-point value `a'
5141| with respect to the corresponding value `b'.  The operation is performed
5142| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5143*----------------------------------------------------------------------------*/
5144
5145float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5146{
5147    flag aSign, bSign, zSign;
5148    int32 aExp, bExp, expDiff;
5149    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5150    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5151    sbits64 sigMean0;
5152    float128 z;
5153
5154    aSig1 = extractFloat128Frac1( a );
5155    aSig0 = extractFloat128Frac0( a );
5156    aExp = extractFloat128Exp( a );
5157    aSign = extractFloat128Sign( a );
5158    bSig1 = extractFloat128Frac1( b );
5159    bSig0 = extractFloat128Frac0( b );
5160    bExp = extractFloat128Exp( b );
5161    bSign = extractFloat128Sign( b );
5162    if ( aExp == 0x7FFF ) {
5163        if (    ( aSig0 | aSig1 )
5164             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5165            return propagateFloat128NaN( a, b STATUS_VAR );
5166        }
5167        goto invalid;
5168    }
5169    if ( bExp == 0x7FFF ) {
5170        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5171        return a;
5172    }
5173    if ( bExp == 0 ) {
5174        if ( ( bSig0 | bSig1 ) == 0 ) {
5175 invalid:
5176            float_raise( float_flag_invalid STATUS_VAR);
5177            z.low = float128_default_nan_low;
5178            z.high = float128_default_nan_high;
5179            return z;
5180        }
5181        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5182    }
5183    if ( aExp == 0 ) {
5184        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5185        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5186    }
5187    expDiff = aExp - bExp;
5188    if ( expDiff < -1 ) return a;
5189    shortShift128Left(
5190        aSig0 | LIT64( 0x0001000000000000 ),
5191        aSig1,
5192        15 - ( expDiff < 0 ),
5193        &aSig0,
5194        &aSig1
5195    );
5196    shortShift128Left(
5197        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5198    q = le128( bSig0, bSig1, aSig0, aSig1 );
5199    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5200    expDiff -= 64;
5201    while ( 0 < expDiff ) {
5202        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5203        q = ( 4 < q ) ? q - 4 : 0;
5204        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5205        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5206        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5207        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5208        expDiff -= 61;
5209    }
5210    if ( -64 < expDiff ) {
5211        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5212        q = ( 4 < q ) ? q - 4 : 0;
5213        q >>= - expDiff;
5214        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5215        expDiff += 52;
5216        if ( expDiff < 0 ) {
5217            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5218        }
5219        else {
5220            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5221        }
5222        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5223        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5224    }
5225    else {
5226        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5227        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5228    }
5229    do {
5230        alternateASig0 = aSig0;
5231        alternateASig1 = aSig1;
5232        ++q;
5233        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5234    } while ( 0 <= (sbits64) aSig0 );
5235    add128(
5236        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5237    if (    ( sigMean0 < 0 )
5238         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5239        aSig0 = alternateASig0;
5240        aSig1 = alternateASig1;
5241    }
5242    zSign = ( (sbits64) aSig0 < 0 );
5243    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5244    return
5245        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5246
5247}
5248
5249/*----------------------------------------------------------------------------
5250| Returns the square root of the quadruple-precision floating-point value `a'.
5251| The operation is performed according to the IEC/IEEE Standard for Binary
5252| Floating-Point Arithmetic.
5253*----------------------------------------------------------------------------*/
5254
5255float128 float128_sqrt( float128 a STATUS_PARAM )
5256{
5257    flag aSign;
5258    int32 aExp, zExp;
5259    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5260    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5261    float128 z;
5262
5263    aSig1 = extractFloat128Frac1( a );
5264    aSig0 = extractFloat128Frac0( a );
5265    aExp = extractFloat128Exp( a );
5266    aSign = extractFloat128Sign( a );
5267    if ( aExp == 0x7FFF ) {
5268        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5269        if ( ! aSign ) return a;
5270        goto invalid;
5271    }
5272    if ( aSign ) {
5273        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5274 invalid:
5275        float_raise( float_flag_invalid STATUS_VAR);
5276        z.low = float128_default_nan_low;
5277        z.high = float128_default_nan_high;
5278        return z;
5279    }
5280    if ( aExp == 0 ) {
5281        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5282        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5283    }
5284    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5285    aSig0 |= LIT64( 0x0001000000000000 );
5286    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5287    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5288    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5289    doubleZSig0 = zSig0<<1;
5290    mul64To128( zSig0, zSig0, &term0, &term1 );
5291    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5292    while ( (sbits64) rem0 < 0 ) {
5293        --zSig0;
5294        doubleZSig0 -= 2;
5295        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5296    }
5297    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5298    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5299        if ( zSig1 == 0 ) zSig1 = 1;
5300        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5301        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5302        mul64To128( zSig1, zSig1, &term2, &term3 );
5303        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5304        while ( (sbits64) rem1 < 0 ) {
5305            --zSig1;
5306            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5307            term3 |= 1;
5308            term2 |= doubleZSig0;
5309            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5310        }
5311        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5312    }
5313    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5314    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5315
5316}
5317
5318/*----------------------------------------------------------------------------
5319| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5320| the corresponding value `b', and 0 otherwise.  The comparison is performed
5321| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5322*----------------------------------------------------------------------------*/
5323
5324int float128_eq( float128 a, float128 b STATUS_PARAM )
5325{
5326
5327    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5328              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5329         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5330              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5331       ) {
5332        if (    float128_is_signaling_nan( a )
5333             || float128_is_signaling_nan( b ) ) {
5334            float_raise( float_flag_invalid STATUS_VAR);
5335        }
5336        return 0;
5337    }
5338    return
5339           ( a.low == b.low )
5340        && (    ( a.high == b.high )
5341             || (    ( a.low == 0 )
5342                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5343           );
5344
5345}
5346
5347/*----------------------------------------------------------------------------
5348| Returns 1 if the quadruple-precision floating-point value `a' is less than
5349| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5350| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5351| Arithmetic.
5352*----------------------------------------------------------------------------*/
5353
5354int float128_le( float128 a, float128 b STATUS_PARAM )
5355{
5356    flag aSign, bSign;
5357
5358    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5359              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5360         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5361              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5362       ) {
5363        float_raise( float_flag_invalid STATUS_VAR);
5364        return 0;
5365    }
5366    aSign = extractFloat128Sign( a );
5367    bSign = extractFloat128Sign( b );
5368    if ( aSign != bSign ) {
5369        return
5370               aSign
5371            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5372                 == 0 );
5373    }
5374    return
5375          aSign ? le128( b.high, b.low, a.high, a.low )
5376        : le128( a.high, a.low, b.high, b.low );
5377
5378}
5379
5380/*----------------------------------------------------------------------------
5381| Returns 1 if the quadruple-precision floating-point value `a' is less than
5382| the corresponding value `b', and 0 otherwise.  The comparison is performed
5383| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5384*----------------------------------------------------------------------------*/
5385
5386int float128_lt( float128 a, float128 b STATUS_PARAM )
5387{
5388    flag aSign, bSign;
5389
5390    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5391              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5392         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5393              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5394       ) {
5395        float_raise( float_flag_invalid STATUS_VAR);
5396        return 0;
5397    }
5398    aSign = extractFloat128Sign( a );
5399    bSign = extractFloat128Sign( b );
5400    if ( aSign != bSign ) {
5401        return
5402               aSign
5403            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5404                 != 0 );
5405    }
5406    return
5407          aSign ? lt128( b.high, b.low, a.high, a.low )
5408        : lt128( a.high, a.low, b.high, b.low );
5409
5410}
5411
5412/*----------------------------------------------------------------------------
5413| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5414| the corresponding value `b', and 0 otherwise.  The invalid exception is
5415| raised if either operand is a NaN.  Otherwise, the comparison is performed
5416| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5417*----------------------------------------------------------------------------*/
5418
5419int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5420{
5421
5422    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5423              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5424         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5425              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5426       ) {
5427        float_raise( float_flag_invalid STATUS_VAR);
5428        return 0;
5429    }
5430    return
5431           ( a.low == b.low )
5432        && (    ( a.high == b.high )
5433             || (    ( a.low == 0 )
5434                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5435           );
5436
5437}
5438
5439/*----------------------------------------------------------------------------
5440| Returns 1 if the quadruple-precision floating-point value `a' is less than
5441| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5442| cause an exception.  Otherwise, the comparison is performed according to the
5443| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5444*----------------------------------------------------------------------------*/
5445
5446int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5447{
5448    flag aSign, bSign;
5449
5450    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5451              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5452         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5453              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5454       ) {
5455        if (    float128_is_signaling_nan( a )
5456             || float128_is_signaling_nan( b ) ) {
5457            float_raise( float_flag_invalid STATUS_VAR);
5458        }
5459        return 0;
5460    }
5461    aSign = extractFloat128Sign( a );
5462    bSign = extractFloat128Sign( b );
5463    if ( aSign != bSign ) {
5464        return
5465               aSign
5466            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5467                 == 0 );
5468    }
5469    return
5470          aSign ? le128( b.high, b.low, a.high, a.low )
5471        : le128( a.high, a.low, b.high, b.low );
5472
5473}
5474
5475/*----------------------------------------------------------------------------
5476| Returns 1 if the quadruple-precision floating-point value `a' is less than
5477| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5478| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5479| Standard for Binary Floating-Point Arithmetic.
5480*----------------------------------------------------------------------------*/
5481
5482int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5483{
5484    flag aSign, bSign;
5485
5486    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5487              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5488         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5489              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5490       ) {
5491        if (    float128_is_signaling_nan( a )
5492             || float128_is_signaling_nan( b ) ) {
5493            float_raise( float_flag_invalid STATUS_VAR);
5494        }
5495        return 0;
5496    }
5497    aSign = extractFloat128Sign( a );
5498    bSign = extractFloat128Sign( b );
5499    if ( aSign != bSign ) {
5500        return
5501               aSign
5502            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5503                 != 0 );
5504    }
5505    return
5506          aSign ? lt128( b.high, b.low, a.high, a.low )
5507        : lt128( a.high, a.low, b.high, b.low );
5508
5509}
5510
5511#endif
5512
5513/* misc functions */
5514float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5515{
5516    return int64_to_float32(a STATUS_VAR);
5517}
5518
5519float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5520{
5521    return int64_to_float64(a STATUS_VAR);
5522}
5523
5524unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5525{
5526    int64_t v;
5527    unsigned int res;
5528
5529    v = float32_to_int64(a STATUS_VAR);
5530    if (v < 0) {
5531        res = 0;
5532        float_raise( float_flag_invalid STATUS_VAR);
5533    } else if (v > 0xffffffff) {
5534        res = 0xffffffff;
5535        float_raise( float_flag_invalid STATUS_VAR);
5536    } else {
5537        res = v;
5538    }
5539    return res;
5540}
5541
5542unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5543{
5544    int64_t v;
5545    unsigned int res;
5546
5547    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5548    if (v < 0) {
5549        res = 0;
5550        float_raise( float_flag_invalid STATUS_VAR);
5551    } else if (v > 0xffffffff) {
5552        res = 0xffffffff;
5553        float_raise( float_flag_invalid STATUS_VAR);
5554    } else {
5555        res = v;
5556    }
5557    return res;
5558}
5559
5560unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5561{
5562    int64_t v;
5563    unsigned int res;
5564
5565    v = float64_to_int64(a STATUS_VAR);
5566    if (v < 0) {
5567        res = 0;
5568        float_raise( float_flag_invalid STATUS_VAR);
5569    } else if (v > 0xffffffff) {
5570        res = 0xffffffff;
5571        float_raise( float_flag_invalid STATUS_VAR);
5572    } else {
5573        res = v;
5574    }
5575    return res;
5576}
5577
5578unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5579{
5580    int64_t v;
5581    unsigned int res;
5582
5583    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5584    if (v < 0) {
5585        res = 0;
5586        float_raise( float_flag_invalid STATUS_VAR);
5587    } else if (v > 0xffffffff) {
5588        res = 0xffffffff;
5589        float_raise( float_flag_invalid STATUS_VAR);
5590    } else {
5591        res = v;
5592    }
5593    return res;
5594}
5595
5596/* FIXME: This looks broken.  */
5597uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5598{
5599    int64_t v;
5600
5601    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5602    v += float64_val(a);
5603    v = float64_to_int64(make_float64(v) STATUS_VAR);
5604
5605    return v - INT64_MIN;
5606}
5607
5608uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5609{
5610    int64_t v;
5611
5612    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5613    v += float64_val(a);
5614    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5615
5616    return v - INT64_MIN;
5617}
5618
5619#define COMPARE(s, nan_exp)                                                  \
5620INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5621                                      int is_quiet STATUS_PARAM )            \
5622{                                                                            \
5623    flag aSign, bSign;                                                       \
5624    bits ## s av, bv;                                                        \
5625                                                                             \
5626    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5627         extractFloat ## s ## Frac( a ) ) ||                                 \
5628        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5629          extractFloat ## s ## Frac( b ) )) {                                \
5630        if (!is_quiet ||                                                     \
5631            float ## s ## _is_signaling_nan( a ) ||                          \
5632            float ## s ## _is_signaling_nan( b ) ) {                         \
5633            float_raise( float_flag_invalid STATUS_VAR);                     \
5634        }                                                                    \
5635        return float_relation_unordered;                                     \
5636    }                                                                        \
5637    aSign = extractFloat ## s ## Sign( a );                                  \
5638    bSign = extractFloat ## s ## Sign( b );                                  \
5639    av = float ## s ## _val(a);                                              \
5640    bv = float ## s ## _val(b);                                              \
5641    if ( aSign != bSign ) {                                                  \
5642        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5643            /* zero case */                                                  \
5644            return float_relation_equal;                                     \
5645        } else {                                                             \
5646            return 1 - (2 * aSign);                                          \
5647        }                                                                    \
5648    } else {                                                                 \
5649        if (av == bv) {                                                      \
5650            return float_relation_equal;                                     \
5651        } else {                                                             \
5652            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5653        }                                                                    \
5654    }                                                                        \
5655}                                                                            \
5656                                                                             \
5657int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5658{                                                                            \
5659    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5660}                                                                            \
5661                                                                             \
5662int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5663{                                                                            \
5664    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5665}
5666
5667COMPARE(32, 0xff)
5668COMPARE(64, 0x7ff)
5669
5670INLINE int float128_compare_internal( float128 a, float128 b,
5671                                      int is_quiet STATUS_PARAM )
5672{
5673    flag aSign, bSign;
5674
5675    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5676          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5677        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5678          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5679        if (!is_quiet ||
5680            float128_is_signaling_nan( a ) ||
5681            float128_is_signaling_nan( b ) ) {
5682            float_raise( float_flag_invalid STATUS_VAR);
5683        }
5684        return float_relation_unordered;
5685    }
5686    aSign = extractFloat128Sign( a );
5687    bSign = extractFloat128Sign( b );
5688    if ( aSign != bSign ) {
5689        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5690            /* zero case */
5691            return float_relation_equal;
5692        } else {
5693            return 1 - (2 * aSign);
5694        }
5695    } else {
5696        if (a.low == b.low && a.high == b.high) {
5697            return float_relation_equal;
5698        } else {
5699            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5700        }
5701    }
5702}
5703
5704int float128_compare( float128 a, float128 b STATUS_PARAM )
5705{
5706    return float128_compare_internal(a, b, 0 STATUS_VAR);
5707}
5708
5709int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5710{
5711    return float128_compare_internal(a, b, 1 STATUS_VAR);
5712}
5713
5714/* Multiply A by 2 raised to the power N.  */
5715float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5716{
5717    flag aSign;
5718    int16 aExp;
5719    bits32 aSig;
5720
5721    aSig = extractFloat32Frac( a );
5722    aExp = extractFloat32Exp( a );
5723    aSign = extractFloat32Sign( a );
5724
5725    if ( aExp == 0xFF ) {
5726        return a;
5727    }
5728    if ( aExp != 0 )
5729        aSig |= 0x00800000;
5730    else if ( aSig == 0 )
5731        return a;
5732
5733    aExp += n - 1;
5734    aSig <<= 7;
5735    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5736}
5737
5738float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5739{
5740    flag aSign;
5741    int16 aExp;
5742    bits64 aSig;
5743
5744    aSig = extractFloat64Frac( a );
5745    aExp = extractFloat64Exp( a );
5746    aSign = extractFloat64Sign( a );
5747
5748    if ( aExp == 0x7FF ) {
5749        return a;
5750    }
5751    if ( aExp != 0 )
5752        aSig |= LIT64( 0x0010000000000000 );
5753    else if ( aSig == 0 )
5754        return a;
5755
5756    aExp += n - 1;
5757    aSig <<= 10;
5758    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5759}
5760
5761#ifdef FLOATX80
5762floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5763{
5764    flag aSign;
5765    int16 aExp;
5766    bits64 aSig;
5767
5768    aSig = extractFloatx80Frac( a );
5769    aExp = extractFloatx80Exp( a );
5770    aSign = extractFloatx80Sign( a );
5771
5772    if ( aExp == 0x7FF ) {
5773        return a;
5774    }
5775    if (aExp == 0 && aSig == 0)
5776        return a;
5777
5778    aExp += n;
5779    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5780                                          aSign, aExp, aSig, 0 STATUS_VAR );
5781}
5782#endif
5783
5784#ifdef FLOAT128
5785float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5786{
5787    flag aSign;
5788    int32 aExp;
5789    bits64 aSig0, aSig1;
5790
5791    aSig1 = extractFloat128Frac1( a );
5792    aSig0 = extractFloat128Frac0( a );
5793    aExp = extractFloat128Exp( a );
5794    aSign = extractFloat128Sign( a );
5795    if ( aExp == 0x7FFF ) {
5796        return a;
5797    }
5798    if ( aExp != 0 )
5799        aSig0 |= LIT64( 0x0001000000000000 );
5800    else if ( aSig0 == 0 && aSig1 == 0 )
5801        return a;
5802
5803    aExp += n - 1;
5804    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5805                                          STATUS_VAR );
5806
5807}
5808#endif
5809