qemu/fpu/softfloat.c
<<
>>
Prefs
   1
   2/*============================================================================
   3
   4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5Package, Release 2b.
   6
   7Written by John R. Hauser.  This work was made possible in part by the
   8International Computer Science Institute, located at Suite 600, 1947 Center
   9Street, Berkeley, California 94704.  Funding was partially provided by the
  10National Science Foundation under grant MIP-9311980.  The original version
  11of this code was written as part of a project to build a fixed-point vector
  12processor in collaboration with the University of California at Berkeley,
  13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15arithmetic/SoftFloat.html'.
  16
  17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26Derivative works are acceptable, even for commercial purposes, so long as
  27(1) the source code for the derivative work includes prominent notice that
  28the work is derivative, and (2) the source code includes prominent notice with
  29these four paragraphs for those parts of this code that are retained.
  30
  31=============================================================================*/
  32
  33/* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
  34   be flushed to zero.  */
  35#include "softfloat.h"
  36
  37/*----------------------------------------------------------------------------
  38| Primitive arithmetic functions, including multi-word arithmetic, and
  39| division and square root approximations.  (Can be specialized to target if
  40| desired.)
  41*----------------------------------------------------------------------------*/
  42#include "softfloat-macros.h"
  43
  44/*----------------------------------------------------------------------------
  45| Functions and definitions to determine:  (1) whether tininess for underflow
  46| is detected before or after rounding by default, (2) what (if anything)
  47| happens when exceptions are raised, (3) how signaling NaNs are distinguished
  48| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  49| are propagated from function inputs to output.  These details are target-
  50| specific.
  51*----------------------------------------------------------------------------*/
  52#include "softfloat-specialize.h"
  53
  54void set_float_rounding_mode(int val STATUS_PARAM)
  55{
  56    STATUS(float_rounding_mode) = val;
  57}
  58
  59void set_float_exception_flags(int val STATUS_PARAM)
  60{
  61    STATUS(float_exception_flags) = val;
  62}
  63
  64#ifdef FLOATX80
  65void set_floatx80_rounding_precision(int val STATUS_PARAM)
  66{
  67    STATUS(floatx80_rounding_precision) = val;
  68}
  69#endif
  70
  71/*----------------------------------------------------------------------------
  72| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  73| and 7, and returns the properly rounded 32-bit integer corresponding to the
  74| input.  If `zSign' is 1, the input is negated before being converted to an
  75| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  76| is simply rounded to an integer, with the inexact exception raised if the
  77| input cannot be represented exactly as an integer.  However, if the fixed-
  78| point input is too large, the invalid exception is raised and the largest
  79| positive or negative integer is returned.
  80*----------------------------------------------------------------------------*/
  81
  82static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  83{
  84    int8 roundingMode;
  85    flag roundNearestEven;
  86    int8 roundIncrement, roundBits;
  87    int32 z;
  88
  89    roundingMode = STATUS(float_rounding_mode);
  90    roundNearestEven = ( roundingMode == float_round_nearest_even );
  91    roundIncrement = 0x40;
  92    if ( ! roundNearestEven ) {
  93        if ( roundingMode == float_round_to_zero ) {
  94            roundIncrement = 0;
  95        }
  96        else {
  97            roundIncrement = 0x7F;
  98            if ( zSign ) {
  99                if ( roundingMode == float_round_up ) roundIncrement = 0;
 100            }
 101            else {
 102                if ( roundingMode == float_round_down ) roundIncrement = 0;
 103            }
 104        }
 105    }
 106    roundBits = absZ & 0x7F;
 107    absZ = ( absZ + roundIncrement )>>7;
 108    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 109    z = absZ;
 110    if ( zSign ) z = - z;
 111    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 112        float_raise( float_flag_invalid STATUS_VAR);
 113        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 114    }
 115    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 116    return z;
 117
 118}
 119
 120/*----------------------------------------------------------------------------
 121| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 122| `absZ1', with binary point between bits 63 and 64 (between the input words),
 123| and returns the properly rounded 64-bit integer corresponding to the input.
 124| If `zSign' is 1, the input is negated before being converted to an integer.
 125| Ordinarily, the fixed-point input is simply rounded to an integer, with
 126| the inexact exception raised if the input cannot be represented exactly as
 127| an integer.  However, if the fixed-point input is too large, the invalid
 128| exception is raised and the largest positive or negative integer is
 129| returned.
 130*----------------------------------------------------------------------------*/
 131
 132static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 133{
 134    int8 roundingMode;
 135    flag roundNearestEven, increment;
 136    int64 z;
 137
 138    roundingMode = STATUS(float_rounding_mode);
 139    roundNearestEven = ( roundingMode == float_round_nearest_even );
 140    increment = ( (sbits64) absZ1 < 0 );
 141    if ( ! roundNearestEven ) {
 142        if ( roundingMode == float_round_to_zero ) {
 143            increment = 0;
 144        }
 145        else {
 146            if ( zSign ) {
 147                increment = ( roundingMode == float_round_down ) && absZ1;
 148            }
 149            else {
 150                increment = ( roundingMode == float_round_up ) && absZ1;
 151            }
 152        }
 153    }
 154    if ( increment ) {
 155        ++absZ0;
 156        if ( absZ0 == 0 ) goto overflow;
 157        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 158    }
 159    z = absZ0;
 160    if ( zSign ) z = - z;
 161    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 162 overflow:
 163        float_raise( float_flag_invalid STATUS_VAR);
 164        return
 165              zSign ? (sbits64) LIT64( 0x8000000000000000 )
 166            : LIT64( 0x7FFFFFFFFFFFFFFF );
 167    }
 168    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 169    return z;
 170
 171}
 172
 173/*----------------------------------------------------------------------------
 174| Returns the fraction bits of the single-precision floating-point value `a'.
 175*----------------------------------------------------------------------------*/
 176
 177INLINE bits32 extractFloat32Frac( float32 a )
 178{
 179
 180    return float32_val(a) & 0x007FFFFF;
 181
 182}
 183
 184/*----------------------------------------------------------------------------
 185| Returns the exponent bits of the single-precision floating-point value `a'.
 186*----------------------------------------------------------------------------*/
 187
 188INLINE int16 extractFloat32Exp( float32 a )
 189{
 190
 191    return ( float32_val(a)>>23 ) & 0xFF;
 192
 193}
 194
 195/*----------------------------------------------------------------------------
 196| Returns the sign bit of the single-precision floating-point value `a'.
 197*----------------------------------------------------------------------------*/
 198
 199INLINE flag extractFloat32Sign( float32 a )
 200{
 201
 202    return float32_val(a)>>31;
 203
 204}
 205
 206/*----------------------------------------------------------------------------
 207| Normalizes the subnormal single-precision floating-point value represented
 208| by the denormalized significand `aSig'.  The normalized exponent and
 209| significand are stored at the locations pointed to by `zExpPtr' and
 210| `zSigPtr', respectively.
 211*----------------------------------------------------------------------------*/
 212
 213static void
 214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 215{
 216    int8 shiftCount;
 217
 218    shiftCount = countLeadingZeros32( aSig ) - 8;
 219    *zSigPtr = aSig<<shiftCount;
 220    *zExpPtr = 1 - shiftCount;
 221
 222}
 223
 224/*----------------------------------------------------------------------------
 225| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 226| single-precision floating-point value, returning the result.  After being
 227| shifted into the proper positions, the three fields are simply added
 228| together to form the result.  This means that any integer portion of `zSig'
 229| will be added into the exponent.  Since a properly normalized significand
 230| will have an integer portion equal to 1, the `zExp' input should be 1 less
 231| than the desired result exponent whenever `zSig' is a complete, normalized
 232| significand.
 233*----------------------------------------------------------------------------*/
 234
 235INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 236{
 237
 238    return make_float32(
 239          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 240
 241}
 242
 243/*----------------------------------------------------------------------------
 244| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 245| and significand `zSig', and returns the proper single-precision floating-
 246| point value corresponding to the abstract input.  Ordinarily, the abstract
 247| value is simply rounded and packed into the single-precision format, with
 248| the inexact exception raised if the abstract input cannot be represented
 249| exactly.  However, if the abstract value is too large, the overflow and
 250| inexact exceptions are raised and an infinity or maximal finite value is
 251| returned.  If the abstract value is too small, the input value is rounded to
 252| a subnormal number, and the underflow and inexact exceptions are raised if
 253| the abstract input cannot be represented exactly as a subnormal single-
 254| precision floating-point number.
 255|     The input significand `zSig' has its binary point between bits 30
 256| and 29, which is 7 bits to the left of the usual location.  This shifted
 257| significand must be normalized or smaller.  If `zSig' is not normalized,
 258| `zExp' must be 0; in that case, the result returned is a subnormal number,
 259| and it must not require rounding.  In the usual case that `zSig' is
 260| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 261| The handling of underflow and overflow follows the IEC/IEEE Standard for
 262| Binary Floating-Point Arithmetic.
 263*----------------------------------------------------------------------------*/
 264
 265static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 266{
 267    int8 roundingMode;
 268    flag roundNearestEven;
 269    int8 roundIncrement, roundBits;
 270    flag isTiny;
 271
 272    roundingMode = STATUS(float_rounding_mode);
 273    roundNearestEven = ( roundingMode == float_round_nearest_even );
 274    roundIncrement = 0x40;
 275    if ( ! roundNearestEven ) {
 276        if ( roundingMode == float_round_to_zero ) {
 277            roundIncrement = 0;
 278        }
 279        else {
 280            roundIncrement = 0x7F;
 281            if ( zSign ) {
 282                if ( roundingMode == float_round_up ) roundIncrement = 0;
 283            }
 284            else {
 285                if ( roundingMode == float_round_down ) roundIncrement = 0;
 286            }
 287        }
 288    }
 289    roundBits = zSig & 0x7F;
 290    if ( 0xFD <= (bits16) zExp ) {
 291        if (    ( 0xFD < zExp )
 292             || (    ( zExp == 0xFD )
 293                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 294           ) {
 295            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 296            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 297        }
 298        if ( zExp < 0 ) {
 299            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 300            isTiny =
 301                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 302                || ( zExp < -1 )
 303                || ( zSig + roundIncrement < 0x80000000 );
 304            shift32RightJamming( zSig, - zExp, &zSig );
 305            zExp = 0;
 306            roundBits = zSig & 0x7F;
 307            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 308        }
 309    }
 310    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 311    zSig = ( zSig + roundIncrement )>>7;
 312    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 313    if ( zSig == 0 ) zExp = 0;
 314    return packFloat32( zSign, zExp, zSig );
 315
 316}
 317
 318/*----------------------------------------------------------------------------
 319| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 320| and significand `zSig', and returns the proper single-precision floating-
 321| point value corresponding to the abstract input.  This routine is just like
 322| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 323| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 324| floating-point exponent.
 325*----------------------------------------------------------------------------*/
 326
 327static float32
 328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 329{
 330    int8 shiftCount;
 331
 332    shiftCount = countLeadingZeros32( zSig ) - 1;
 333    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 334
 335}
 336
 337/*----------------------------------------------------------------------------
 338| Returns the fraction bits of the double-precision floating-point value `a'.
 339*----------------------------------------------------------------------------*/
 340
 341INLINE bits64 extractFloat64Frac( float64 a )
 342{
 343
 344    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 345
 346}
 347
 348/*----------------------------------------------------------------------------
 349| Returns the exponent bits of the double-precision floating-point value `a'.
 350*----------------------------------------------------------------------------*/
 351
 352INLINE int16 extractFloat64Exp( float64 a )
 353{
 354
 355    return ( float64_val(a)>>52 ) & 0x7FF;
 356
 357}
 358
 359/*----------------------------------------------------------------------------
 360| Returns the sign bit of the double-precision floating-point value `a'.
 361*----------------------------------------------------------------------------*/
 362
 363INLINE flag extractFloat64Sign( float64 a )
 364{
 365
 366    return float64_val(a)>>63;
 367
 368}
 369
 370/*----------------------------------------------------------------------------
 371| Normalizes the subnormal double-precision floating-point value represented
 372| by the denormalized significand `aSig'.  The normalized exponent and
 373| significand are stored at the locations pointed to by `zExpPtr' and
 374| `zSigPtr', respectively.
 375*----------------------------------------------------------------------------*/
 376
 377static void
 378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 379{
 380    int8 shiftCount;
 381
 382    shiftCount = countLeadingZeros64( aSig ) - 11;
 383    *zSigPtr = aSig<<shiftCount;
 384    *zExpPtr = 1 - shiftCount;
 385
 386}
 387
 388/*----------------------------------------------------------------------------
 389| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 390| double-precision floating-point value, returning the result.  After being
 391| shifted into the proper positions, the three fields are simply added
 392| together to form the result.  This means that any integer portion of `zSig'
 393| will be added into the exponent.  Since a properly normalized significand
 394| will have an integer portion equal to 1, the `zExp' input should be 1 less
 395| than the desired result exponent whenever `zSig' is a complete, normalized
 396| significand.
 397*----------------------------------------------------------------------------*/
 398
 399INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 400{
 401
 402    return make_float64(
 403        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 404
 405}
 406
 407/*----------------------------------------------------------------------------
 408| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 409| and significand `zSig', and returns the proper double-precision floating-
 410| point value corresponding to the abstract input.  Ordinarily, the abstract
 411| value is simply rounded and packed into the double-precision format, with
 412| the inexact exception raised if the abstract input cannot be represented
 413| exactly.  However, if the abstract value is too large, the overflow and
 414| inexact exceptions are raised and an infinity or maximal finite value is
 415| returned.  If the abstract value is too small, the input value is rounded
 416| to a subnormal number, and the underflow and inexact exceptions are raised
 417| if the abstract input cannot be represented exactly as a subnormal double-
 418| precision floating-point number.
 419|     The input significand `zSig' has its binary point between bits 62
 420| and 61, which is 10 bits to the left of the usual location.  This shifted
 421| significand must be normalized or smaller.  If `zSig' is not normalized,
 422| `zExp' must be 0; in that case, the result returned is a subnormal number,
 423| and it must not require rounding.  In the usual case that `zSig' is
 424| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 425| The handling of underflow and overflow follows the IEC/IEEE Standard for
 426| Binary Floating-Point Arithmetic.
 427*----------------------------------------------------------------------------*/
 428
 429static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 430{
 431    int8 roundingMode;
 432    flag roundNearestEven;
 433    int16 roundIncrement, roundBits;
 434    flag isTiny;
 435
 436    roundingMode = STATUS(float_rounding_mode);
 437    roundNearestEven = ( roundingMode == float_round_nearest_even );
 438    roundIncrement = 0x200;
 439    if ( ! roundNearestEven ) {
 440        if ( roundingMode == float_round_to_zero ) {
 441            roundIncrement = 0;
 442        }
 443        else {
 444            roundIncrement = 0x3FF;
 445            if ( zSign ) {
 446                if ( roundingMode == float_round_up ) roundIncrement = 0;
 447            }
 448            else {
 449                if ( roundingMode == float_round_down ) roundIncrement = 0;
 450            }
 451        }
 452    }
 453    roundBits = zSig & 0x3FF;
 454    if ( 0x7FD <= (bits16) zExp ) {
 455        if (    ( 0x7FD < zExp )
 456             || (    ( zExp == 0x7FD )
 457                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 458           ) {
 459            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 460            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 461        }
 462        if ( zExp < 0 ) {
 463            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 464            isTiny =
 465                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 466                || ( zExp < -1 )
 467                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 468            shift64RightJamming( zSig, - zExp, &zSig );
 469            zExp = 0;
 470            roundBits = zSig & 0x3FF;
 471            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 472        }
 473    }
 474    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 475    zSig = ( zSig + roundIncrement )>>10;
 476    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 477    if ( zSig == 0 ) zExp = 0;
 478    return packFloat64( zSign, zExp, zSig );
 479
 480}
 481
 482/*----------------------------------------------------------------------------
 483| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 484| and significand `zSig', and returns the proper double-precision floating-
 485| point value corresponding to the abstract input.  This routine is just like
 486| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 487| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 488| floating-point exponent.
 489*----------------------------------------------------------------------------*/
 490
 491static float64
 492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 493{
 494    int8 shiftCount;
 495
 496    shiftCount = countLeadingZeros64( zSig ) - 1;
 497    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 498
 499}
 500
 501#ifdef FLOATX80
 502
 503/*----------------------------------------------------------------------------
 504| Returns the fraction bits of the extended double-precision floating-point
 505| value `a'.
 506*----------------------------------------------------------------------------*/
 507
 508INLINE bits64 extractFloatx80Frac( floatx80 a )
 509{
 510
 511    return a.low;
 512
 513}
 514
 515/*----------------------------------------------------------------------------
 516| Returns the exponent bits of the extended double-precision floating-point
 517| value `a'.
 518*----------------------------------------------------------------------------*/
 519
 520INLINE int32 extractFloatx80Exp( floatx80 a )
 521{
 522
 523    return a.high & 0x7FFF;
 524
 525}
 526
 527/*----------------------------------------------------------------------------
 528| Returns the sign bit of the extended double-precision floating-point value
 529| `a'.
 530*----------------------------------------------------------------------------*/
 531
 532INLINE flag extractFloatx80Sign( floatx80 a )
 533{
 534
 535    return a.high>>15;
 536
 537}
 538
 539/*----------------------------------------------------------------------------
 540| Normalizes the subnormal extended double-precision floating-point value
 541| represented by the denormalized significand `aSig'.  The normalized exponent
 542| and significand are stored at the locations pointed to by `zExpPtr' and
 543| `zSigPtr', respectively.
 544*----------------------------------------------------------------------------*/
 545
 546static void
 547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 548{
 549    int8 shiftCount;
 550
 551    shiftCount = countLeadingZeros64( aSig );
 552    *zSigPtr = aSig<<shiftCount;
 553    *zExpPtr = 1 - shiftCount;
 554
 555}
 556
 557/*----------------------------------------------------------------------------
 558| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 559| extended double-precision floating-point value, returning the result.
 560*----------------------------------------------------------------------------*/
 561
 562INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 563{
 564    floatx80 z;
 565
 566    z.low = zSig;
 567    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 568    return z;
 569
 570}
 571
 572/*----------------------------------------------------------------------------
 573| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 574| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 575| and returns the proper extended double-precision floating-point value
 576| corresponding to the abstract input.  Ordinarily, the abstract value is
 577| rounded and packed into the extended double-precision format, with the
 578| inexact exception raised if the abstract input cannot be represented
 579| exactly.  However, if the abstract value is too large, the overflow and
 580| inexact exceptions are raised and an infinity or maximal finite value is
 581| returned.  If the abstract value is too small, the input value is rounded to
 582| a subnormal number, and the underflow and inexact exceptions are raised if
 583| the abstract input cannot be represented exactly as a subnormal extended
 584| double-precision floating-point number.
 585|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 586| number of bits as single or double precision, respectively.  Otherwise, the
 587| result is rounded to the full precision of the extended double-precision
 588| format.
 589|     The input significand must be normalized or smaller.  If the input
 590| significand is not normalized, `zExp' must be 0; in that case, the result
 591| returned is a subnormal number, and it must not require rounding.  The
 592| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 593| Floating-Point Arithmetic.
 594*----------------------------------------------------------------------------*/
 595
 596static floatx80
 597 roundAndPackFloatx80(
 598     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 599 STATUS_PARAM)
 600{
 601    int8 roundingMode;
 602    flag roundNearestEven, increment, isTiny;
 603    int64 roundIncrement, roundMask, roundBits;
 604
 605    roundingMode = STATUS(float_rounding_mode);
 606    roundNearestEven = ( roundingMode == float_round_nearest_even );
 607    if ( roundingPrecision == 80 ) goto precision80;
 608    if ( roundingPrecision == 64 ) {
 609        roundIncrement = LIT64( 0x0000000000000400 );
 610        roundMask = LIT64( 0x00000000000007FF );
 611    }
 612    else if ( roundingPrecision == 32 ) {
 613        roundIncrement = LIT64( 0x0000008000000000 );
 614        roundMask = LIT64( 0x000000FFFFFFFFFF );
 615    }
 616    else {
 617        goto precision80;
 618    }
 619    zSig0 |= ( zSig1 != 0 );
 620    if ( ! roundNearestEven ) {
 621        if ( roundingMode == float_round_to_zero ) {
 622            roundIncrement = 0;
 623        }
 624        else {
 625            roundIncrement = roundMask;
 626            if ( zSign ) {
 627                if ( roundingMode == float_round_up ) roundIncrement = 0;
 628            }
 629            else {
 630                if ( roundingMode == float_round_down ) roundIncrement = 0;
 631            }
 632        }
 633    }
 634    roundBits = zSig0 & roundMask;
 635    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 636        if (    ( 0x7FFE < zExp )
 637             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 638           ) {
 639            goto overflow;
 640        }
 641        if ( zExp <= 0 ) {
 642            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 643            isTiny =
 644                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 645                || ( zExp < 0 )
 646                || ( zSig0 <= zSig0 + roundIncrement );
 647            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 648            zExp = 0;
 649            roundBits = zSig0 & roundMask;
 650            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 651            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 652            zSig0 += roundIncrement;
 653            if ( (sbits64) zSig0 < 0 ) zExp = 1;
 654            roundIncrement = roundMask + 1;
 655            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 656                roundMask |= roundIncrement;
 657            }
 658            zSig0 &= ~ roundMask;
 659            return packFloatx80( zSign, zExp, zSig0 );
 660        }
 661    }
 662    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 663    zSig0 += roundIncrement;
 664    if ( zSig0 < roundIncrement ) {
 665        ++zExp;
 666        zSig0 = LIT64( 0x8000000000000000 );
 667    }
 668    roundIncrement = roundMask + 1;
 669    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 670        roundMask |= roundIncrement;
 671    }
 672    zSig0 &= ~ roundMask;
 673    if ( zSig0 == 0 ) zExp = 0;
 674    return packFloatx80( zSign, zExp, zSig0 );
 675 precision80:
 676    increment = ( (sbits64) zSig1 < 0 );
 677    if ( ! roundNearestEven ) {
 678        if ( roundingMode == float_round_to_zero ) {
 679            increment = 0;
 680        }
 681        else {
 682            if ( zSign ) {
 683                increment = ( roundingMode == float_round_down ) && zSig1;
 684            }
 685            else {
 686                increment = ( roundingMode == float_round_up ) && zSig1;
 687            }
 688        }
 689    }
 690    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691        if (    ( 0x7FFE < zExp )
 692             || (    ( zExp == 0x7FFE )
 693                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 694                  && increment
 695                )
 696           ) {
 697            roundMask = 0;
 698 overflow:
 699            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 700            if (    ( roundingMode == float_round_to_zero )
 701                 || ( zSign && ( roundingMode == float_round_up ) )
 702                 || ( ! zSign && ( roundingMode == float_round_down ) )
 703               ) {
 704                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 705            }
 706            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 707        }
 708        if ( zExp <= 0 ) {
 709            isTiny =
 710                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 711                || ( zExp < 0 )
 712                || ! increment
 713                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 714            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 715            zExp = 0;
 716            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 717            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 718            if ( roundNearestEven ) {
 719                increment = ( (sbits64) zSig1 < 0 );
 720            }
 721            else {
 722                if ( zSign ) {
 723                    increment = ( roundingMode == float_round_down ) && zSig1;
 724                }
 725                else {
 726                    increment = ( roundingMode == float_round_up ) && zSig1;
 727                }
 728            }
 729            if ( increment ) {
 730                ++zSig0;
 731                zSig0 &=
 732                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 733                if ( (sbits64) zSig0 < 0 ) zExp = 1;
 734            }
 735            return packFloatx80( zSign, zExp, zSig0 );
 736        }
 737    }
 738    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 739    if ( increment ) {
 740        ++zSig0;
 741        if ( zSig0 == 0 ) {
 742            ++zExp;
 743            zSig0 = LIT64( 0x8000000000000000 );
 744        }
 745        else {
 746            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 747        }
 748    }
 749    else {
 750        if ( zSig0 == 0 ) zExp = 0;
 751    }
 752    return packFloatx80( zSign, zExp, zSig0 );
 753
 754}
 755
 756/*----------------------------------------------------------------------------
 757| Takes an abstract floating-point value having sign `zSign', exponent
 758| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 759| and returns the proper extended double-precision floating-point value
 760| corresponding to the abstract input.  This routine is just like
 761| `roundAndPackFloatx80' except that the input significand does not have to be
 762| normalized.
 763*----------------------------------------------------------------------------*/
 764
 765static floatx80
 766 normalizeRoundAndPackFloatx80(
 767     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 768 STATUS_PARAM)
 769{
 770    int8 shiftCount;
 771
 772    if ( zSig0 == 0 ) {
 773        zSig0 = zSig1;
 774        zSig1 = 0;
 775        zExp -= 64;
 776    }
 777    shiftCount = countLeadingZeros64( zSig0 );
 778    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 779    zExp -= shiftCount;
 780    return
 781        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 782
 783}
 784
 785#endif
 786
 787#ifdef FLOAT128
 788
 789/*----------------------------------------------------------------------------
 790| Returns the least-significant 64 fraction bits of the quadruple-precision
 791| floating-point value `a'.
 792*----------------------------------------------------------------------------*/
 793
 794INLINE bits64 extractFloat128Frac1( float128 a )
 795{
 796
 797    return a.low;
 798
 799}
 800
 801/*----------------------------------------------------------------------------
 802| Returns the most-significant 48 fraction bits of the quadruple-precision
 803| floating-point value `a'.
 804*----------------------------------------------------------------------------*/
 805
 806INLINE bits64 extractFloat128Frac0( float128 a )
 807{
 808
 809    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 810
 811}
 812
 813/*----------------------------------------------------------------------------
 814| Returns the exponent bits of the quadruple-precision floating-point value
 815| `a'.
 816*----------------------------------------------------------------------------*/
 817
 818INLINE int32 extractFloat128Exp( float128 a )
 819{
 820
 821    return ( a.high>>48 ) & 0x7FFF;
 822
 823}
 824
 825/*----------------------------------------------------------------------------
 826| Returns the sign bit of the quadruple-precision floating-point value `a'.
 827*----------------------------------------------------------------------------*/
 828
 829INLINE flag extractFloat128Sign( float128 a )
 830{
 831
 832    return a.high>>63;
 833
 834}
 835
 836/*----------------------------------------------------------------------------
 837| Normalizes the subnormal quadruple-precision floating-point value
 838| represented by the denormalized significand formed by the concatenation of
 839| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 840| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 841| significand are stored at the location pointed to by `zSig0Ptr', and the
 842| least significant 64 bits of the normalized significand are stored at the
 843| location pointed to by `zSig1Ptr'.
 844*----------------------------------------------------------------------------*/
 845
 846static void
 847 normalizeFloat128Subnormal(
 848     bits64 aSig0,
 849     bits64 aSig1,
 850     int32 *zExpPtr,
 851     bits64 *zSig0Ptr,
 852     bits64 *zSig1Ptr
 853 )
 854{
 855    int8 shiftCount;
 856
 857    if ( aSig0 == 0 ) {
 858        shiftCount = countLeadingZeros64( aSig1 ) - 15;
 859        if ( shiftCount < 0 ) {
 860            *zSig0Ptr = aSig1>>( - shiftCount );
 861            *zSig1Ptr = aSig1<<( shiftCount & 63 );
 862        }
 863        else {
 864            *zSig0Ptr = aSig1<<shiftCount;
 865            *zSig1Ptr = 0;
 866        }
 867        *zExpPtr = - shiftCount - 63;
 868    }
 869    else {
 870        shiftCount = countLeadingZeros64( aSig0 ) - 15;
 871        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 872        *zExpPtr = 1 - shiftCount;
 873    }
 874
 875}
 876
 877/*----------------------------------------------------------------------------
 878| Packs the sign `zSign', the exponent `zExp', and the significand formed
 879| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 880| floating-point value, returning the result.  After being shifted into the
 881| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 882| added together to form the most significant 32 bits of the result.  This
 883| means that any integer portion of `zSig0' will be added into the exponent.
 884| Since a properly normalized significand will have an integer portion equal
 885| to 1, the `zExp' input should be 1 less than the desired result exponent
 886| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 887| significand.
 888*----------------------------------------------------------------------------*/
 889
 890INLINE float128
 891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 892{
 893    float128 z;
 894
 895    z.low = zSig1;
 896    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 897    return z;
 898
 899}
 900
 901/*----------------------------------------------------------------------------
 902| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 903| and extended significand formed by the concatenation of `zSig0', `zSig1',
 904| and `zSig2', and returns the proper quadruple-precision floating-point value
 905| corresponding to the abstract input.  Ordinarily, the abstract value is
 906| simply rounded and packed into the quadruple-precision format, with the
 907| inexact exception raised if the abstract input cannot be represented
 908| exactly.  However, if the abstract value is too large, the overflow and
 909| inexact exceptions are raised and an infinity or maximal finite value is
 910| returned.  If the abstract value is too small, the input value is rounded to
 911| a subnormal number, and the underflow and inexact exceptions are raised if
 912| the abstract input cannot be represented exactly as a subnormal quadruple-
 913| precision floating-point number.
 914|     The input significand must be normalized or smaller.  If the input
 915| significand is not normalized, `zExp' must be 0; in that case, the result
 916| returned is a subnormal number, and it must not require rounding.  In the
 917| usual case that the input significand is normalized, `zExp' must be 1 less
 918| than the ``true'' floating-point exponent.  The handling of underflow and
 919| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 920*----------------------------------------------------------------------------*/
 921
 922static float128
 923 roundAndPackFloat128(
 924     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 925{
 926    int8 roundingMode;
 927    flag roundNearestEven, increment, isTiny;
 928
 929    roundingMode = STATUS(float_rounding_mode);
 930    roundNearestEven = ( roundingMode == float_round_nearest_even );
 931    increment = ( (sbits64) zSig2 < 0 );
 932    if ( ! roundNearestEven ) {
 933        if ( roundingMode == float_round_to_zero ) {
 934            increment = 0;
 935        }
 936        else {
 937            if ( zSign ) {
 938                increment = ( roundingMode == float_round_down ) && zSig2;
 939            }
 940            else {
 941                increment = ( roundingMode == float_round_up ) && zSig2;
 942            }
 943        }
 944    }
 945    if ( 0x7FFD <= (bits32) zExp ) {
 946        if (    ( 0x7FFD < zExp )
 947             || (    ( zExp == 0x7FFD )
 948                  && eq128(
 949                         LIT64( 0x0001FFFFFFFFFFFF ),
 950                         LIT64( 0xFFFFFFFFFFFFFFFF ),
 951                         zSig0,
 952                         zSig1
 953                     )
 954                  && increment
 955                )
 956           ) {
 957            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 958            if (    ( roundingMode == float_round_to_zero )
 959                 || ( zSign && ( roundingMode == float_round_up ) )
 960                 || ( ! zSign && ( roundingMode == float_round_down ) )
 961               ) {
 962                return
 963                    packFloat128(
 964                        zSign,
 965                        0x7FFE,
 966                        LIT64( 0x0000FFFFFFFFFFFF ),
 967                        LIT64( 0xFFFFFFFFFFFFFFFF )
 968                    );
 969            }
 970            return packFloat128( zSign, 0x7FFF, 0, 0 );
 971        }
 972        if ( zExp < 0 ) {
 973            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
 974            isTiny =
 975                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 976                || ( zExp < -1 )
 977                || ! increment
 978                || lt128(
 979                       zSig0,
 980                       zSig1,
 981                       LIT64( 0x0001FFFFFFFFFFFF ),
 982                       LIT64( 0xFFFFFFFFFFFFFFFF )
 983                   );
 984            shift128ExtraRightJamming(
 985                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 986            zExp = 0;
 987            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 988            if ( roundNearestEven ) {
 989                increment = ( (sbits64) zSig2 < 0 );
 990            }
 991            else {
 992                if ( zSign ) {
 993                    increment = ( roundingMode == float_round_down ) && zSig2;
 994                }
 995                else {
 996                    increment = ( roundingMode == float_round_up ) && zSig2;
 997                }
 998            }
 999        }
1000    }
1001    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002    if ( increment ) {
1003        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005    }
1006    else {
1007        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008    }
1009    return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011}
1012
1013/*----------------------------------------------------------------------------
1014| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016| returns the proper quadruple-precision floating-point value corresponding
1017| to the abstract input.  This routine is just like `roundAndPackFloat128'
1018| except that the input significand has fewer bits and does not have to be
1019| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020| point exponent.
1021*----------------------------------------------------------------------------*/
1022
1023static float128
1024 normalizeRoundAndPackFloat128(
1025     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026{
1027    int8 shiftCount;
1028    bits64 zSig2;
1029
1030    if ( zSig0 == 0 ) {
1031        zSig0 = zSig1;
1032        zSig1 = 0;
1033        zExp -= 64;
1034    }
1035    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036    if ( 0 <= shiftCount ) {
1037        zSig2 = 0;
1038        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039    }
1040    else {
1041        shift128ExtraRightJamming(
1042            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043    }
1044    zExp -= shiftCount;
1045    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047}
1048
1049#endif
1050
1051/*----------------------------------------------------------------------------
1052| Returns the result of converting the 32-bit two's complement integer `a'
1053| to the single-precision floating-point format.  The conversion is performed
1054| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055*----------------------------------------------------------------------------*/
1056
1057float32 int32_to_float32( int32 a STATUS_PARAM )
1058{
1059    flag zSign;
1060
1061    if ( a == 0 ) return float32_zero;
1062    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063    zSign = ( a < 0 );
1064    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066}
1067
1068/*----------------------------------------------------------------------------
1069| Returns the result of converting the 32-bit two's complement integer `a'
1070| to the double-precision floating-point format.  The conversion is performed
1071| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072*----------------------------------------------------------------------------*/
1073
1074float64 int32_to_float64( int32 a STATUS_PARAM )
1075{
1076    flag zSign;
1077    uint32 absA;
1078    int8 shiftCount;
1079    bits64 zSig;
1080
1081    if ( a == 0 ) return float64_zero;
1082    zSign = ( a < 0 );
1083    absA = zSign ? - a : a;
1084    shiftCount = countLeadingZeros32( absA ) + 21;
1085    zSig = absA;
1086    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088}
1089
1090#ifdef FLOATX80
1091
1092/*----------------------------------------------------------------------------
1093| Returns the result of converting the 32-bit two's complement integer `a'
1094| to the extended double-precision floating-point format.  The conversion
1095| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096| Arithmetic.
1097*----------------------------------------------------------------------------*/
1098
1099floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100{
1101    flag zSign;
1102    uint32 absA;
1103    int8 shiftCount;
1104    bits64 zSig;
1105
1106    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107    zSign = ( a < 0 );
1108    absA = zSign ? - a : a;
1109    shiftCount = countLeadingZeros32( absA ) + 32;
1110    zSig = absA;
1111    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113}
1114
1115#endif
1116
1117#ifdef FLOAT128
1118
1119/*----------------------------------------------------------------------------
1120| Returns the result of converting the 32-bit two's complement integer `a' to
1121| the quadruple-precision floating-point format.  The conversion is performed
1122| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123*----------------------------------------------------------------------------*/
1124
1125float128 int32_to_float128( int32 a STATUS_PARAM )
1126{
1127    flag zSign;
1128    uint32 absA;
1129    int8 shiftCount;
1130    bits64 zSig0;
1131
1132    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133    zSign = ( a < 0 );
1134    absA = zSign ? - a : a;
1135    shiftCount = countLeadingZeros32( absA ) + 17;
1136    zSig0 = absA;
1137    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139}
1140
1141#endif
1142
1143/*----------------------------------------------------------------------------
1144| Returns the result of converting the 64-bit two's complement integer `a'
1145| to the single-precision floating-point format.  The conversion is performed
1146| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147*----------------------------------------------------------------------------*/
1148
1149float32 int64_to_float32( int64 a STATUS_PARAM )
1150{
1151    flag zSign;
1152    uint64 absA;
1153    int8 shiftCount;
1154
1155    if ( a == 0 ) return float32_zero;
1156    zSign = ( a < 0 );
1157    absA = zSign ? - a : a;
1158    shiftCount = countLeadingZeros64( absA ) - 40;
1159    if ( 0 <= shiftCount ) {
1160        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161    }
1162    else {
1163        shiftCount += 7;
1164        if ( shiftCount < 0 ) {
1165            shift64RightJamming( absA, - shiftCount, &absA );
1166        }
1167        else {
1168            absA <<= shiftCount;
1169        }
1170        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171    }
1172
1173}
1174
1175float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176{
1177    int8 shiftCount;
1178
1179    if ( a == 0 ) return float32_zero;
1180    shiftCount = countLeadingZeros64( a ) - 40;
1181    if ( 0 <= shiftCount ) {
1182        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183    }
1184    else {
1185        shiftCount += 7;
1186        if ( shiftCount < 0 ) {
1187            shift64RightJamming( a, - shiftCount, &a );
1188        }
1189        else {
1190            a <<= shiftCount;
1191        }
1192        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193    }
1194}
1195
1196/*----------------------------------------------------------------------------
1197| Returns the result of converting the 64-bit two's complement integer `a'
1198| to the double-precision floating-point format.  The conversion is performed
1199| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200*----------------------------------------------------------------------------*/
1201
1202float64 int64_to_float64( int64 a STATUS_PARAM )
1203{
1204    flag zSign;
1205
1206    if ( a == 0 ) return float64_zero;
1207    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208        return packFloat64( 1, 0x43E, 0 );
1209    }
1210    zSign = ( a < 0 );
1211    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213}
1214
1215float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216{
1217    if ( a == 0 ) return float64_zero;
1218    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220}
1221
1222#ifdef FLOATX80
1223
1224/*----------------------------------------------------------------------------
1225| Returns the result of converting the 64-bit two's complement integer `a'
1226| to the extended double-precision floating-point format.  The conversion
1227| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228| Arithmetic.
1229*----------------------------------------------------------------------------*/
1230
1231floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232{
1233    flag zSign;
1234    uint64 absA;
1235    int8 shiftCount;
1236
1237    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238    zSign = ( a < 0 );
1239    absA = zSign ? - a : a;
1240    shiftCount = countLeadingZeros64( absA );
1241    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243}
1244
1245#endif
1246
1247#ifdef FLOAT128
1248
1249/*----------------------------------------------------------------------------
1250| Returns the result of converting the 64-bit two's complement integer `a' to
1251| the quadruple-precision floating-point format.  The conversion is performed
1252| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253*----------------------------------------------------------------------------*/
1254
1255float128 int64_to_float128( int64 a STATUS_PARAM )
1256{
1257    flag zSign;
1258    uint64 absA;
1259    int8 shiftCount;
1260    int32 zExp;
1261    bits64 zSig0, zSig1;
1262
1263    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264    zSign = ( a < 0 );
1265    absA = zSign ? - a : a;
1266    shiftCount = countLeadingZeros64( absA ) + 49;
1267    zExp = 0x406E - shiftCount;
1268    if ( 64 <= shiftCount ) {
1269        zSig1 = 0;
1270        zSig0 = absA;
1271        shiftCount -= 64;
1272    }
1273    else {
1274        zSig1 = absA;
1275        zSig0 = 0;
1276    }
1277    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278    return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280}
1281
1282#endif
1283
1284/*----------------------------------------------------------------------------
1285| Returns the result of converting the single-precision floating-point value
1286| `a' to the 32-bit two's complement integer format.  The conversion is
1287| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288| Arithmetic---which means in particular that the conversion is rounded
1289| according to the current rounding mode.  If `a' is a NaN, the largest
1290| positive integer is returned.  Otherwise, if the conversion overflows, the
1291| largest integer with the same sign as `a' is returned.
1292*----------------------------------------------------------------------------*/
1293
1294int32 float32_to_int32( float32 a STATUS_PARAM )
1295{
1296    flag aSign;
1297    int16 aExp, shiftCount;
1298    bits32 aSig;
1299    bits64 aSig64;
1300
1301    aSig = extractFloat32Frac( a );
1302    aExp = extractFloat32Exp( a );
1303    aSign = extractFloat32Sign( a );
1304    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305    if ( aExp ) aSig |= 0x00800000;
1306    shiftCount = 0xAF - aExp;
1307    aSig64 = aSig;
1308    aSig64 <<= 32;
1309    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312}
1313
1314/*----------------------------------------------------------------------------
1315| Returns the result of converting the single-precision floating-point value
1316| `a' to the 32-bit two's complement integer format.  The conversion is
1317| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318| Arithmetic, except that the conversion is always rounded toward zero.
1319| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320| the conversion overflows, the largest integer with the same sign as `a' is
1321| returned.
1322*----------------------------------------------------------------------------*/
1323
1324int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325{
1326    flag aSign;
1327    int16 aExp, shiftCount;
1328    bits32 aSig;
1329    int32 z;
1330
1331    aSig = extractFloat32Frac( a );
1332    aExp = extractFloat32Exp( a );
1333    aSign = extractFloat32Sign( a );
1334    shiftCount = aExp - 0x9E;
1335    if ( 0 <= shiftCount ) {
1336        if ( float32_val(a) != 0xCF000000 ) {
1337            float_raise( float_flag_invalid STATUS_VAR);
1338            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339        }
1340        return (sbits32) 0x80000000;
1341    }
1342    else if ( aExp <= 0x7E ) {
1343        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344        return 0;
1345    }
1346    aSig = ( aSig | 0x00800000 )<<8;
1347    z = aSig>>( - shiftCount );
1348    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349        STATUS(float_exception_flags) |= float_flag_inexact;
1350    }
1351    if ( aSign ) z = - z;
1352    return z;
1353
1354}
1355
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the single-precision floating-point value
1358| `a' to the 64-bit two's complement integer format.  The conversion is
1359| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360| Arithmetic---which means in particular that the conversion is rounded
1361| according to the current rounding mode.  If `a' is a NaN, the largest
1362| positive integer is returned.  Otherwise, if the conversion overflows, the
1363| largest integer with the same sign as `a' is returned.
1364*----------------------------------------------------------------------------*/
1365
1366int64 float32_to_int64( float32 a STATUS_PARAM )
1367{
1368    flag aSign;
1369    int16 aExp, shiftCount;
1370    bits32 aSig;
1371    bits64 aSig64, aSigExtra;
1372
1373    aSig = extractFloat32Frac( a );
1374    aExp = extractFloat32Exp( a );
1375    aSign = extractFloat32Sign( a );
1376    shiftCount = 0xBE - aExp;
1377    if ( shiftCount < 0 ) {
1378        float_raise( float_flag_invalid STATUS_VAR);
1379        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380            return LIT64( 0x7FFFFFFFFFFFFFFF );
1381        }
1382        return (sbits64) LIT64( 0x8000000000000000 );
1383    }
1384    if ( aExp ) aSig |= 0x00800000;
1385    aSig64 = aSig;
1386    aSig64 <<= 40;
1387    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390}
1391
1392/*----------------------------------------------------------------------------
1393| Returns the result of converting the single-precision floating-point value
1394| `a' to the 64-bit two's complement integer format.  The conversion is
1395| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396| Arithmetic, except that the conversion is always rounded toward zero.  If
1397| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398| conversion overflows, the largest integer with the same sign as `a' is
1399| returned.
1400*----------------------------------------------------------------------------*/
1401
1402int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403{
1404    flag aSign;
1405    int16 aExp, shiftCount;
1406    bits32 aSig;
1407    bits64 aSig64;
1408    int64 z;
1409
1410    aSig = extractFloat32Frac( a );
1411    aExp = extractFloat32Exp( a );
1412    aSign = extractFloat32Sign( a );
1413    shiftCount = aExp - 0xBE;
1414    if ( 0 <= shiftCount ) {
1415        if ( float32_val(a) != 0xDF000000 ) {
1416            float_raise( float_flag_invalid STATUS_VAR);
1417            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418                return LIT64( 0x7FFFFFFFFFFFFFFF );
1419            }
1420        }
1421        return (sbits64) LIT64( 0x8000000000000000 );
1422    }
1423    else if ( aExp <= 0x7E ) {
1424        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425        return 0;
1426    }
1427    aSig64 = aSig | 0x00800000;
1428    aSig64 <<= 40;
1429    z = aSig64>>( - shiftCount );
1430    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431        STATUS(float_exception_flags) |= float_flag_inexact;
1432    }
1433    if ( aSign ) z = - z;
1434    return z;
1435
1436}
1437
1438/*----------------------------------------------------------------------------
1439| Returns the result of converting the single-precision floating-point value
1440| `a' to the double-precision floating-point format.  The conversion is
1441| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442| Arithmetic.
1443*----------------------------------------------------------------------------*/
1444
1445float64 float32_to_float64( float32 a STATUS_PARAM )
1446{
1447    flag aSign;
1448    int16 aExp;
1449    bits32 aSig;
1450
1451    aSig = extractFloat32Frac( a );
1452    aExp = extractFloat32Exp( a );
1453    aSign = extractFloat32Sign( a );
1454    if ( aExp == 0xFF ) {
1455        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456        return packFloat64( aSign, 0x7FF, 0 );
1457    }
1458    if ( aExp == 0 ) {
1459        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461        --aExp;
1462    }
1463    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465}
1466
1467#ifdef FLOATX80
1468
1469/*----------------------------------------------------------------------------
1470| Returns the result of converting the single-precision floating-point value
1471| `a' to the extended double-precision floating-point format.  The conversion
1472| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473| Arithmetic.
1474*----------------------------------------------------------------------------*/
1475
1476floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477{
1478    flag aSign;
1479    int16 aExp;
1480    bits32 aSig;
1481
1482    aSig = extractFloat32Frac( a );
1483    aExp = extractFloat32Exp( a );
1484    aSign = extractFloat32Sign( a );
1485    if ( aExp == 0xFF ) {
1486        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488    }
1489    if ( aExp == 0 ) {
1490        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492    }
1493    aSig |= 0x00800000;
1494    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496}
1497
1498#endif
1499
1500#ifdef FLOAT128
1501
1502/*----------------------------------------------------------------------------
1503| Returns the result of converting the single-precision floating-point value
1504| `a' to the double-precision floating-point format.  The conversion is
1505| performed according to the IEC/IEEE Standard for Binary Floating-Point
1506| Arithmetic.
1507*----------------------------------------------------------------------------*/
1508
1509float128 float32_to_float128( float32 a STATUS_PARAM )
1510{
1511    flag aSign;
1512    int16 aExp;
1513    bits32 aSig;
1514
1515    aSig = extractFloat32Frac( a );
1516    aExp = extractFloat32Exp( a );
1517    aSign = extractFloat32Sign( a );
1518    if ( aExp == 0xFF ) {
1519        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520        return packFloat128( aSign, 0x7FFF, 0, 0 );
1521    }
1522    if ( aExp == 0 ) {
1523        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525        --aExp;
1526    }
1527    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529}
1530
1531#endif
1532
1533/*----------------------------------------------------------------------------
1534| Rounds the single-precision floating-point value `a' to an integer, and
1535| returns the result as a single-precision floating-point value.  The
1536| operation is performed according to the IEC/IEEE Standard for Binary
1537| Floating-Point Arithmetic.
1538*----------------------------------------------------------------------------*/
1539
1540float32 float32_round_to_int( float32 a STATUS_PARAM)
1541{
1542    flag aSign;
1543    int16 aExp;
1544    bits32 lastBitMask, roundBitsMask;
1545    int8 roundingMode;
1546    bits32 z;
1547
1548    aExp = extractFloat32Exp( a );
1549    if ( 0x96 <= aExp ) {
1550        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551            return propagateFloat32NaN( a, a STATUS_VAR );
1552        }
1553        return a;
1554    }
1555    if ( aExp <= 0x7E ) {
1556        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557        STATUS(float_exception_flags) |= float_flag_inexact;
1558        aSign = extractFloat32Sign( a );
1559        switch ( STATUS(float_rounding_mode) ) {
1560         case float_round_nearest_even:
1561            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562                return packFloat32( aSign, 0x7F, 0 );
1563            }
1564            break;
1565         case float_round_down:
1566            return make_float32(aSign ? 0xBF800000 : 0);
1567         case float_round_up:
1568            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569        }
1570        return packFloat32( aSign, 0, 0 );
1571    }
1572    lastBitMask = 1;
1573    lastBitMask <<= 0x96 - aExp;
1574    roundBitsMask = lastBitMask - 1;
1575    z = float32_val(a);
1576    roundingMode = STATUS(float_rounding_mode);
1577    if ( roundingMode == float_round_nearest_even ) {
1578        z += lastBitMask>>1;
1579        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580    }
1581    else if ( roundingMode != float_round_to_zero ) {
1582        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583            z += roundBitsMask;
1584        }
1585    }
1586    z &= ~ roundBitsMask;
1587    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588    return make_float32(z);
1589
1590}
1591
1592/*----------------------------------------------------------------------------
1593| Returns the result of adding the absolute values of the single-precision
1594| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595| before being returned.  `zSign' is ignored if the result is a NaN.
1596| The addition is performed according to the IEC/IEEE Standard for Binary
1597| Floating-Point Arithmetic.
1598*----------------------------------------------------------------------------*/
1599
1600static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601{
1602    int16 aExp, bExp, zExp;
1603    bits32 aSig, bSig, zSig;
1604    int16 expDiff;
1605
1606    aSig = extractFloat32Frac( a );
1607    aExp = extractFloat32Exp( a );
1608    bSig = extractFloat32Frac( b );
1609    bExp = extractFloat32Exp( b );
1610    expDiff = aExp - bExp;
1611    aSig <<= 6;
1612    bSig <<= 6;
1613    if ( 0 < expDiff ) {
1614        if ( aExp == 0xFF ) {
1615            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616            return a;
1617        }
1618        if ( bExp == 0 ) {
1619            --expDiff;
1620        }
1621        else {
1622            bSig |= 0x20000000;
1623        }
1624        shift32RightJamming( bSig, expDiff, &bSig );
1625        zExp = aExp;
1626    }
1627    else if ( expDiff < 0 ) {
1628        if ( bExp == 0xFF ) {
1629            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630            return packFloat32( zSign, 0xFF, 0 );
1631        }
1632        if ( aExp == 0 ) {
1633            ++expDiff;
1634        }
1635        else {
1636            aSig |= 0x20000000;
1637        }
1638        shift32RightJamming( aSig, - expDiff, &aSig );
1639        zExp = bExp;
1640    }
1641    else {
1642        if ( aExp == 0xFF ) {
1643            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644            return a;
1645        }
1646        if ( aExp == 0 ) {
1647            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649        }
1650        zSig = 0x40000000 + aSig + bSig;
1651        zExp = aExp;
1652        goto roundAndPack;
1653    }
1654    aSig |= 0x20000000;
1655    zSig = ( aSig + bSig )<<1;
1656    --zExp;
1657    if ( (sbits32) zSig < 0 ) {
1658        zSig = aSig + bSig;
1659        ++zExp;
1660    }
1661 roundAndPack:
1662    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664}
1665
1666/*----------------------------------------------------------------------------
1667| Returns the result of subtracting the absolute values of the single-
1668| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669| difference is negated before being returned.  `zSign' is ignored if the
1670| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671| Standard for Binary Floating-Point Arithmetic.
1672*----------------------------------------------------------------------------*/
1673
1674static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675{
1676    int16 aExp, bExp, zExp;
1677    bits32 aSig, bSig, zSig;
1678    int16 expDiff;
1679
1680    aSig = extractFloat32Frac( a );
1681    aExp = extractFloat32Exp( a );
1682    bSig = extractFloat32Frac( b );
1683    bExp = extractFloat32Exp( b );
1684    expDiff = aExp - bExp;
1685    aSig <<= 7;
1686    bSig <<= 7;
1687    if ( 0 < expDiff ) goto aExpBigger;
1688    if ( expDiff < 0 ) goto bExpBigger;
1689    if ( aExp == 0xFF ) {
1690        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691        float_raise( float_flag_invalid STATUS_VAR);
1692        return float32_default_nan;
1693    }
1694    if ( aExp == 0 ) {
1695        aExp = 1;
1696        bExp = 1;
1697    }
1698    if ( bSig < aSig ) goto aBigger;
1699    if ( aSig < bSig ) goto bBigger;
1700    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701 bExpBigger:
1702    if ( bExp == 0xFF ) {
1703        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704        return packFloat32( zSign ^ 1, 0xFF, 0 );
1705    }
1706    if ( aExp == 0 ) {
1707        ++expDiff;
1708    }
1709    else {
1710        aSig |= 0x40000000;
1711    }
1712    shift32RightJamming( aSig, - expDiff, &aSig );
1713    bSig |= 0x40000000;
1714 bBigger:
1715    zSig = bSig - aSig;
1716    zExp = bExp;
1717    zSign ^= 1;
1718    goto normalizeRoundAndPack;
1719 aExpBigger:
1720    if ( aExp == 0xFF ) {
1721        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722        return a;
1723    }
1724    if ( bExp == 0 ) {
1725        --expDiff;
1726    }
1727    else {
1728        bSig |= 0x40000000;
1729    }
1730    shift32RightJamming( bSig, expDiff, &bSig );
1731    aSig |= 0x40000000;
1732 aBigger:
1733    zSig = aSig - bSig;
1734    zExp = aExp;
1735 normalizeRoundAndPack:
1736    --zExp;
1737    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739}
1740
1741/*----------------------------------------------------------------------------
1742| Returns the result of adding the single-precision floating-point values `a'
1743| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744| Binary Floating-Point Arithmetic.
1745*----------------------------------------------------------------------------*/
1746
1747float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748{
1749    flag aSign, bSign;
1750
1751    aSign = extractFloat32Sign( a );
1752    bSign = extractFloat32Sign( b );
1753    if ( aSign == bSign ) {
1754        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755    }
1756    else {
1757        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758    }
1759
1760}
1761
1762/*----------------------------------------------------------------------------
1763| Returns the result of subtracting the single-precision floating-point values
1764| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765| for Binary Floating-Point Arithmetic.
1766*----------------------------------------------------------------------------*/
1767
1768float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769{
1770    flag aSign, bSign;
1771
1772    aSign = extractFloat32Sign( a );
1773    bSign = extractFloat32Sign( b );
1774    if ( aSign == bSign ) {
1775        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776    }
1777    else {
1778        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779    }
1780
1781}
1782
1783/*----------------------------------------------------------------------------
1784| Returns the result of multiplying the single-precision floating-point values
1785| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786| for Binary Floating-Point Arithmetic.
1787*----------------------------------------------------------------------------*/
1788
1789float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790{
1791    flag aSign, bSign, zSign;
1792    int16 aExp, bExp, zExp;
1793    bits32 aSig, bSig;
1794    bits64 zSig64;
1795    bits32 zSig;
1796
1797    aSig = extractFloat32Frac( a );
1798    aExp = extractFloat32Exp( a );
1799    aSign = extractFloat32Sign( a );
1800    bSig = extractFloat32Frac( b );
1801    bExp = extractFloat32Exp( b );
1802    bSign = extractFloat32Sign( b );
1803    zSign = aSign ^ bSign;
1804    if ( aExp == 0xFF ) {
1805        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806            return propagateFloat32NaN( a, b STATUS_VAR );
1807        }
1808        if ( ( bExp | bSig ) == 0 ) {
1809            float_raise( float_flag_invalid STATUS_VAR);
1810            return float32_default_nan;
1811        }
1812        return packFloat32( zSign, 0xFF, 0 );
1813    }
1814    if ( bExp == 0xFF ) {
1815        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816        if ( ( aExp | aSig ) == 0 ) {
1817            float_raise( float_flag_invalid STATUS_VAR);
1818            return float32_default_nan;
1819        }
1820        return packFloat32( zSign, 0xFF, 0 );
1821    }
1822    if ( aExp == 0 ) {
1823        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825    }
1826    if ( bExp == 0 ) {
1827        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829    }
1830    zExp = aExp + bExp - 0x7F;
1831    aSig = ( aSig | 0x00800000 )<<7;
1832    bSig = ( bSig | 0x00800000 )<<8;
1833    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834    zSig = zSig64;
1835    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836        zSig <<= 1;
1837        --zExp;
1838    }
1839    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841}
1842
1843/*----------------------------------------------------------------------------
1844| Returns the result of dividing the single-precision floating-point value `a'
1845| by the corresponding value `b'.  The operation is performed according to the
1846| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847*----------------------------------------------------------------------------*/
1848
1849float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850{
1851    flag aSign, bSign, zSign;
1852    int16 aExp, bExp, zExp;
1853    bits32 aSig, bSig, zSig;
1854
1855    aSig = extractFloat32Frac( a );
1856    aExp = extractFloat32Exp( a );
1857    aSign = extractFloat32Sign( a );
1858    bSig = extractFloat32Frac( b );
1859    bExp = extractFloat32Exp( b );
1860    bSign = extractFloat32Sign( b );
1861    zSign = aSign ^ bSign;
1862    if ( aExp == 0xFF ) {
1863        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864        if ( bExp == 0xFF ) {
1865            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866            float_raise( float_flag_invalid STATUS_VAR);
1867            return float32_default_nan;
1868        }
1869        return packFloat32( zSign, 0xFF, 0 );
1870    }
1871    if ( bExp == 0xFF ) {
1872        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873        return packFloat32( zSign, 0, 0 );
1874    }
1875    if ( bExp == 0 ) {
1876        if ( bSig == 0 ) {
1877            if ( ( aExp | aSig ) == 0 ) {
1878                float_raise( float_flag_invalid STATUS_VAR);
1879                return float32_default_nan;
1880            }
1881            float_raise( float_flag_divbyzero STATUS_VAR);
1882            return packFloat32( zSign, 0xFF, 0 );
1883        }
1884        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885    }
1886    if ( aExp == 0 ) {
1887        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889    }
1890    zExp = aExp - bExp + 0x7D;
1891    aSig = ( aSig | 0x00800000 )<<7;
1892    bSig = ( bSig | 0x00800000 )<<8;
1893    if ( bSig <= ( aSig + aSig ) ) {
1894        aSig >>= 1;
1895        ++zExp;
1896    }
1897    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898    if ( ( zSig & 0x3F ) == 0 ) {
1899        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900    }
1901    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903}
1904
1905/*----------------------------------------------------------------------------
1906| Returns the remainder of the single-precision floating-point value `a'
1907| with respect to the corresponding value `b'.  The operation is performed
1908| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909*----------------------------------------------------------------------------*/
1910
1911float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912{
1913    flag aSign, bSign, zSign;
1914    int16 aExp, bExp, expDiff;
1915    bits32 aSig, bSig;
1916    bits32 q;
1917    bits64 aSig64, bSig64, q64;
1918    bits32 alternateASig;
1919    sbits32 sigMean;
1920
1921    aSig = extractFloat32Frac( a );
1922    aExp = extractFloat32Exp( a );
1923    aSign = extractFloat32Sign( a );
1924    bSig = extractFloat32Frac( b );
1925    bExp = extractFloat32Exp( b );
1926    bSign = extractFloat32Sign( b );
1927    if ( aExp == 0xFF ) {
1928        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1929            return propagateFloat32NaN( a, b STATUS_VAR );
1930        }
1931        float_raise( float_flag_invalid STATUS_VAR);
1932        return float32_default_nan;
1933    }
1934    if ( bExp == 0xFF ) {
1935        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936        return a;
1937    }
1938    if ( bExp == 0 ) {
1939        if ( bSig == 0 ) {
1940            float_raise( float_flag_invalid STATUS_VAR);
1941            return float32_default_nan;
1942        }
1943        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944    }
1945    if ( aExp == 0 ) {
1946        if ( aSig == 0 ) return a;
1947        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948    }
1949    expDiff = aExp - bExp;
1950    aSig |= 0x00800000;
1951    bSig |= 0x00800000;
1952    if ( expDiff < 32 ) {
1953        aSig <<= 8;
1954        bSig <<= 8;
1955        if ( expDiff < 0 ) {
1956            if ( expDiff < -1 ) return a;
1957            aSig >>= 1;
1958        }
1959        q = ( bSig <= aSig );
1960        if ( q ) aSig -= bSig;
1961        if ( 0 < expDiff ) {
1962            q = ( ( (bits64) aSig )<<32 ) / bSig;
1963            q >>= 32 - expDiff;
1964            bSig >>= 2;
1965            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966        }
1967        else {
1968            aSig >>= 2;
1969            bSig >>= 2;
1970        }
1971    }
1972    else {
1973        if ( bSig <= aSig ) aSig -= bSig;
1974        aSig64 = ( (bits64) aSig )<<40;
1975        bSig64 = ( (bits64) bSig )<<40;
1976        expDiff -= 64;
1977        while ( 0 < expDiff ) {
1978            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980            aSig64 = - ( ( bSig * q64 )<<38 );
1981            expDiff -= 62;
1982        }
1983        expDiff += 64;
1984        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986        q = q64>>( 64 - expDiff );
1987        bSig <<= 6;
1988        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989    }
1990    do {
1991        alternateASig = aSig;
1992        ++q;
1993        aSig -= bSig;
1994    } while ( 0 <= (sbits32) aSig );
1995    sigMean = aSig + alternateASig;
1996    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997        aSig = alternateASig;
1998    }
1999    zSign = ( (sbits32) aSig < 0 );
2000    if ( zSign ) aSig = - aSig;
2001    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2002
2003}
2004
2005/*----------------------------------------------------------------------------
2006| Returns the square root of the single-precision floating-point value `a'.
2007| The operation is performed according to the IEC/IEEE Standard for Binary
2008| Floating-Point Arithmetic.
2009*----------------------------------------------------------------------------*/
2010
2011float32 float32_sqrt( float32 a STATUS_PARAM )
2012{
2013    flag aSign;
2014    int16 aExp, zExp;
2015    bits32 aSig, zSig;
2016    bits64 rem, term;
2017
2018    aSig = extractFloat32Frac( a );
2019    aExp = extractFloat32Exp( a );
2020    aSign = extractFloat32Sign( a );
2021    if ( aExp == 0xFF ) {
2022        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023        if ( ! aSign ) return a;
2024        float_raise( float_flag_invalid STATUS_VAR);
2025        return float32_default_nan;
2026    }
2027    if ( aSign ) {
2028        if ( ( aExp | aSig ) == 0 ) return a;
2029        float_raise( float_flag_invalid STATUS_VAR);
2030        return float32_default_nan;
2031    }
2032    if ( aExp == 0 ) {
2033        if ( aSig == 0 ) return float32_zero;
2034        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035    }
2036    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037    aSig = ( aSig | 0x00800000 )<<8;
2038    zSig = estimateSqrt32( aExp, aSig ) + 2;
2039    if ( ( zSig & 0x7F ) <= 5 ) {
2040        if ( zSig < 2 ) {
2041            zSig = 0x7FFFFFFF;
2042            goto roundAndPack;
2043        }
2044        aSig >>= aExp & 1;
2045        term = ( (bits64) zSig ) * zSig;
2046        rem = ( ( (bits64) aSig )<<32 ) - term;
2047        while ( (sbits64) rem < 0 ) {
2048            --zSig;
2049            rem += ( ( (bits64) zSig )<<1 ) | 1;
2050        }
2051        zSig |= ( rem != 0 );
2052    }
2053    shift32RightJamming( zSig, 1, &zSig );
2054 roundAndPack:
2055    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2056
2057}
2058
2059/*----------------------------------------------------------------------------
2060| Returns the binary log of the single-precision floating-point value `a'.
2061| The operation is performed according to the IEC/IEEE Standard for Binary
2062| Floating-Point Arithmetic.
2063*----------------------------------------------------------------------------*/
2064float32 float32_log2( float32 a STATUS_PARAM )
2065{
2066    flag aSign, zSign;
2067    int16 aExp;
2068    bits32 aSig, zSig, i;
2069
2070    aSig = extractFloat32Frac( a );
2071    aExp = extractFloat32Exp( a );
2072    aSign = extractFloat32Sign( a );
2073
2074    if ( aExp == 0 ) {
2075        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2076        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2077    }
2078    if ( aSign ) {
2079        float_raise( float_flag_invalid STATUS_VAR);
2080        return float32_default_nan;
2081    }
2082    if ( aExp == 0xFF ) {
2083        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2084        return a;
2085    }
2086
2087    aExp -= 0x7F;
2088    aSig |= 0x00800000;
2089    zSign = aExp < 0;
2090    zSig = aExp << 23;
2091
2092    for (i = 1 << 22; i > 0; i >>= 1) {
2093        aSig = ( (bits64)aSig * aSig ) >> 23;
2094        if ( aSig & 0x01000000 ) {
2095            aSig >>= 1;
2096            zSig |= i;
2097        }
2098    }
2099
2100    if ( zSign )
2101        zSig = -zSig;
2102
2103    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2104}
2105
2106/*----------------------------------------------------------------------------
2107| Returns 1 if the single-precision floating-point value `a' is equal to
2108| the corresponding value `b', and 0 otherwise.  The comparison is performed
2109| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2110*----------------------------------------------------------------------------*/
2111
2112int float32_eq( float32 a, float32 b STATUS_PARAM )
2113{
2114
2115    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117       ) {
2118        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119            float_raise( float_flag_invalid STATUS_VAR);
2120        }
2121        return 0;
2122    }
2123    return ( float32_val(a) == float32_val(b) ) ||
2124            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2125
2126}
2127
2128/*----------------------------------------------------------------------------
2129| Returns 1 if the single-precision floating-point value `a' is less than
2130| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2131| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2132| Arithmetic.
2133*----------------------------------------------------------------------------*/
2134
2135int float32_le( float32 a, float32 b STATUS_PARAM )
2136{
2137    flag aSign, bSign;
2138    bits32 av, bv;
2139
2140    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142       ) {
2143        float_raise( float_flag_invalid STATUS_VAR);
2144        return 0;
2145    }
2146    aSign = extractFloat32Sign( a );
2147    bSign = extractFloat32Sign( b );
2148    av = float32_val(a);
2149    bv = float32_val(b);
2150    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2151    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2152
2153}
2154
2155/*----------------------------------------------------------------------------
2156| Returns 1 if the single-precision floating-point value `a' is less than
2157| the corresponding value `b', and 0 otherwise.  The comparison is performed
2158| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2159*----------------------------------------------------------------------------*/
2160
2161int float32_lt( float32 a, float32 b STATUS_PARAM )
2162{
2163    flag aSign, bSign;
2164    bits32 av, bv;
2165
2166    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2167         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2168       ) {
2169        float_raise( float_flag_invalid STATUS_VAR);
2170        return 0;
2171    }
2172    aSign = extractFloat32Sign( a );
2173    bSign = extractFloat32Sign( b );
2174    av = float32_val(a);
2175    bv = float32_val(b);
2176    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2177    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2178
2179}
2180
2181/*----------------------------------------------------------------------------
2182| Returns 1 if the single-precision floating-point value `a' is equal to
2183| the corresponding value `b', and 0 otherwise.  The invalid exception is
2184| raised if either operand is a NaN.  Otherwise, the comparison is performed
2185| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2186*----------------------------------------------------------------------------*/
2187
2188int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2189{
2190    bits32 av, bv;
2191
2192    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2193         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194       ) {
2195        float_raise( float_flag_invalid STATUS_VAR);
2196        return 0;
2197    }
2198    av = float32_val(a);
2199    bv = float32_val(b);
2200    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2201
2202}
2203
2204/*----------------------------------------------------------------------------
2205| Returns 1 if the single-precision floating-point value `a' is less than or
2206| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2207| cause an exception.  Otherwise, the comparison is performed according to the
2208| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2209*----------------------------------------------------------------------------*/
2210
2211int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2212{
2213    flag aSign, bSign;
2214    bits32 av, bv;
2215
2216    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218       ) {
2219        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2220            float_raise( float_flag_invalid STATUS_VAR);
2221        }
2222        return 0;
2223    }
2224    aSign = extractFloat32Sign( a );
2225    bSign = extractFloat32Sign( b );
2226    av = float32_val(a);
2227    bv = float32_val(b);
2228    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2230
2231}
2232
2233/*----------------------------------------------------------------------------
2234| Returns 1 if the single-precision floating-point value `a' is less than
2235| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2236| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2237| Standard for Binary Floating-Point Arithmetic.
2238*----------------------------------------------------------------------------*/
2239
2240int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2241{
2242    flag aSign, bSign;
2243    bits32 av, bv;
2244
2245    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2246         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247       ) {
2248        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2249            float_raise( float_flag_invalid STATUS_VAR);
2250        }
2251        return 0;
2252    }
2253    aSign = extractFloat32Sign( a );
2254    bSign = extractFloat32Sign( b );
2255    av = float32_val(a);
2256    bv = float32_val(b);
2257    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2258    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2259
2260}
2261
2262/*----------------------------------------------------------------------------
2263| Returns the result of converting the double-precision floating-point value
2264| `a' to the 32-bit two's complement integer format.  The conversion is
2265| performed according to the IEC/IEEE Standard for Binary Floating-Point
2266| Arithmetic---which means in particular that the conversion is rounded
2267| according to the current rounding mode.  If `a' is a NaN, the largest
2268| positive integer is returned.  Otherwise, if the conversion overflows, the
2269| largest integer with the same sign as `a' is returned.
2270*----------------------------------------------------------------------------*/
2271
2272int32 float64_to_int32( float64 a STATUS_PARAM )
2273{
2274    flag aSign;
2275    int16 aExp, shiftCount;
2276    bits64 aSig;
2277
2278    aSig = extractFloat64Frac( a );
2279    aExp = extractFloat64Exp( a );
2280    aSign = extractFloat64Sign( a );
2281    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2282    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2283    shiftCount = 0x42C - aExp;
2284    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2285    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2286
2287}
2288
2289/*----------------------------------------------------------------------------
2290| Returns the result of converting the double-precision floating-point value
2291| `a' to the 32-bit two's complement integer format.  The conversion is
2292| performed according to the IEC/IEEE Standard for Binary Floating-Point
2293| Arithmetic, except that the conversion is always rounded toward zero.
2294| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2295| the conversion overflows, the largest integer with the same sign as `a' is
2296| returned.
2297*----------------------------------------------------------------------------*/
2298
2299int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2300{
2301    flag aSign;
2302    int16 aExp, shiftCount;
2303    bits64 aSig, savedASig;
2304    int32 z;
2305
2306    aSig = extractFloat64Frac( a );
2307    aExp = extractFloat64Exp( a );
2308    aSign = extractFloat64Sign( a );
2309    if ( 0x41E < aExp ) {
2310        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2311        goto invalid;
2312    }
2313    else if ( aExp < 0x3FF ) {
2314        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315        return 0;
2316    }
2317    aSig |= LIT64( 0x0010000000000000 );
2318    shiftCount = 0x433 - aExp;
2319    savedASig = aSig;
2320    aSig >>= shiftCount;
2321    z = aSig;
2322    if ( aSign ) z = - z;
2323    if ( ( z < 0 ) ^ aSign ) {
2324 invalid:
2325        float_raise( float_flag_invalid STATUS_VAR);
2326        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2327    }
2328    if ( ( aSig<<shiftCount ) != savedASig ) {
2329        STATUS(float_exception_flags) |= float_flag_inexact;
2330    }
2331    return z;
2332
2333}
2334
2335/*----------------------------------------------------------------------------
2336| Returns the result of converting the double-precision floating-point value
2337| `a' to the 64-bit two's complement integer format.  The conversion is
2338| performed according to the IEC/IEEE Standard for Binary Floating-Point
2339| Arithmetic---which means in particular that the conversion is rounded
2340| according to the current rounding mode.  If `a' is a NaN, the largest
2341| positive integer is returned.  Otherwise, if the conversion overflows, the
2342| largest integer with the same sign as `a' is returned.
2343*----------------------------------------------------------------------------*/
2344
2345int64 float64_to_int64( float64 a STATUS_PARAM )
2346{
2347    flag aSign;
2348    int16 aExp, shiftCount;
2349    bits64 aSig, aSigExtra;
2350
2351    aSig = extractFloat64Frac( a );
2352    aExp = extractFloat64Exp( a );
2353    aSign = extractFloat64Sign( a );
2354    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2355    shiftCount = 0x433 - aExp;
2356    if ( shiftCount <= 0 ) {
2357        if ( 0x43E < aExp ) {
2358            float_raise( float_flag_invalid STATUS_VAR);
2359            if (    ! aSign
2360                 || (    ( aExp == 0x7FF )
2361                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2362               ) {
2363                return LIT64( 0x7FFFFFFFFFFFFFFF );
2364            }
2365            return (sbits64) LIT64( 0x8000000000000000 );
2366        }
2367        aSigExtra = 0;
2368        aSig <<= - shiftCount;
2369    }
2370    else {
2371        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2372    }
2373    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2374
2375}
2376
2377/*----------------------------------------------------------------------------
2378| Returns the result of converting the double-precision floating-point value
2379| `a' to the 64-bit two's complement integer format.  The conversion is
2380| performed according to the IEC/IEEE Standard for Binary Floating-Point
2381| Arithmetic, except that the conversion is always rounded toward zero.
2382| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2383| the conversion overflows, the largest integer with the same sign as `a' is
2384| returned.
2385*----------------------------------------------------------------------------*/
2386
2387int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2388{
2389    flag aSign;
2390    int16 aExp, shiftCount;
2391    bits64 aSig;
2392    int64 z;
2393
2394    aSig = extractFloat64Frac( a );
2395    aExp = extractFloat64Exp( a );
2396    aSign = extractFloat64Sign( a );
2397    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2398    shiftCount = aExp - 0x433;
2399    if ( 0 <= shiftCount ) {
2400        if ( 0x43E <= aExp ) {
2401            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2402                float_raise( float_flag_invalid STATUS_VAR);
2403                if (    ! aSign
2404                     || (    ( aExp == 0x7FF )
2405                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2406                   ) {
2407                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2408                }
2409            }
2410            return (sbits64) LIT64( 0x8000000000000000 );
2411        }
2412        z = aSig<<shiftCount;
2413    }
2414    else {
2415        if ( aExp < 0x3FE ) {
2416            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2417            return 0;
2418        }
2419        z = aSig>>( - shiftCount );
2420        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2421            STATUS(float_exception_flags) |= float_flag_inexact;
2422        }
2423    }
2424    if ( aSign ) z = - z;
2425    return z;
2426
2427}
2428
2429/*----------------------------------------------------------------------------
2430| Returns the result of converting the double-precision floating-point value
2431| `a' to the single-precision floating-point format.  The conversion is
2432| performed according to the IEC/IEEE Standard for Binary Floating-Point
2433| Arithmetic.
2434*----------------------------------------------------------------------------*/
2435
2436float32 float64_to_float32( float64 a STATUS_PARAM )
2437{
2438    flag aSign;
2439    int16 aExp;
2440    bits64 aSig;
2441    bits32 zSig;
2442
2443    aSig = extractFloat64Frac( a );
2444    aExp = extractFloat64Exp( a );
2445    aSign = extractFloat64Sign( a );
2446    if ( aExp == 0x7FF ) {
2447        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2448        return packFloat32( aSign, 0xFF, 0 );
2449    }
2450    shift64RightJamming( aSig, 22, &aSig );
2451    zSig = aSig;
2452    if ( aExp || zSig ) {
2453        zSig |= 0x40000000;
2454        aExp -= 0x381;
2455    }
2456    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2457
2458}
2459
2460#ifdef FLOATX80
2461
2462/*----------------------------------------------------------------------------
2463| Returns the result of converting the double-precision floating-point value
2464| `a' to the extended double-precision floating-point format.  The conversion
2465| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2466| Arithmetic.
2467*----------------------------------------------------------------------------*/
2468
2469floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2470{
2471    flag aSign;
2472    int16 aExp;
2473    bits64 aSig;
2474
2475    aSig = extractFloat64Frac( a );
2476    aExp = extractFloat64Exp( a );
2477    aSign = extractFloat64Sign( a );
2478    if ( aExp == 0x7FF ) {
2479        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2480        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2481    }
2482    if ( aExp == 0 ) {
2483        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2484        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2485    }
2486    return
2487        packFloatx80(
2488            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2489
2490}
2491
2492#endif
2493
2494#ifdef FLOAT128
2495
2496/*----------------------------------------------------------------------------
2497| Returns the result of converting the double-precision floating-point value
2498| `a' to the quadruple-precision floating-point format.  The conversion is
2499| performed according to the IEC/IEEE Standard for Binary Floating-Point
2500| Arithmetic.
2501*----------------------------------------------------------------------------*/
2502
2503float128 float64_to_float128( float64 a STATUS_PARAM )
2504{
2505    flag aSign;
2506    int16 aExp;
2507    bits64 aSig, zSig0, zSig1;
2508
2509    aSig = extractFloat64Frac( a );
2510    aExp = extractFloat64Exp( a );
2511    aSign = extractFloat64Sign( a );
2512    if ( aExp == 0x7FF ) {
2513        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2514        return packFloat128( aSign, 0x7FFF, 0, 0 );
2515    }
2516    if ( aExp == 0 ) {
2517        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2518        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2519        --aExp;
2520    }
2521    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2522    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2523
2524}
2525
2526#endif
2527
2528/*----------------------------------------------------------------------------
2529| Rounds the double-precision floating-point value `a' to an integer, and
2530| returns the result as a double-precision floating-point value.  The
2531| operation is performed according to the IEC/IEEE Standard for Binary
2532| Floating-Point Arithmetic.
2533*----------------------------------------------------------------------------*/
2534
2535float64 float64_round_to_int( float64 a STATUS_PARAM )
2536{
2537    flag aSign;
2538    int16 aExp;
2539    bits64 lastBitMask, roundBitsMask;
2540    int8 roundingMode;
2541    bits64 z;
2542
2543    aExp = extractFloat64Exp( a );
2544    if ( 0x433 <= aExp ) {
2545        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2546            return propagateFloat64NaN( a, a STATUS_VAR );
2547        }
2548        return a;
2549    }
2550    if ( aExp < 0x3FF ) {
2551        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2552        STATUS(float_exception_flags) |= float_flag_inexact;
2553        aSign = extractFloat64Sign( a );
2554        switch ( STATUS(float_rounding_mode) ) {
2555         case float_round_nearest_even:
2556            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2557                return packFloat64( aSign, 0x3FF, 0 );
2558            }
2559            break;
2560         case float_round_down:
2561            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2562         case float_round_up:
2563            return make_float64(
2564            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2565        }
2566        return packFloat64( aSign, 0, 0 );
2567    }
2568    lastBitMask = 1;
2569    lastBitMask <<= 0x433 - aExp;
2570    roundBitsMask = lastBitMask - 1;
2571    z = float64_val(a);
2572    roundingMode = STATUS(float_rounding_mode);
2573    if ( roundingMode == float_round_nearest_even ) {
2574        z += lastBitMask>>1;
2575        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2576    }
2577    else if ( roundingMode != float_round_to_zero ) {
2578        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2579            z += roundBitsMask;
2580        }
2581    }
2582    z &= ~ roundBitsMask;
2583    if ( z != float64_val(a) )
2584        STATUS(float_exception_flags) |= float_flag_inexact;
2585    return make_float64(z);
2586
2587}
2588
2589float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2590{
2591    int oldmode;
2592    float64 res;
2593    oldmode = STATUS(float_rounding_mode);
2594    STATUS(float_rounding_mode) = float_round_to_zero;
2595    res = float64_round_to_int(a STATUS_VAR);
2596    STATUS(float_rounding_mode) = oldmode;
2597    return res;
2598}
2599
2600/*----------------------------------------------------------------------------
2601| Returns the result of adding the absolute values of the double-precision
2602| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2603| before being returned.  `zSign' is ignored if the result is a NaN.
2604| The addition is performed according to the IEC/IEEE Standard for Binary
2605| Floating-Point Arithmetic.
2606*----------------------------------------------------------------------------*/
2607
2608static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2609{
2610    int16 aExp, bExp, zExp;
2611    bits64 aSig, bSig, zSig;
2612    int16 expDiff;
2613
2614    aSig = extractFloat64Frac( a );
2615    aExp = extractFloat64Exp( a );
2616    bSig = extractFloat64Frac( b );
2617    bExp = extractFloat64Exp( b );
2618    expDiff = aExp - bExp;
2619    aSig <<= 9;
2620    bSig <<= 9;
2621    if ( 0 < expDiff ) {
2622        if ( aExp == 0x7FF ) {
2623            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624            return a;
2625        }
2626        if ( bExp == 0 ) {
2627            --expDiff;
2628        }
2629        else {
2630            bSig |= LIT64( 0x2000000000000000 );
2631        }
2632        shift64RightJamming( bSig, expDiff, &bSig );
2633        zExp = aExp;
2634    }
2635    else if ( expDiff < 0 ) {
2636        if ( bExp == 0x7FF ) {
2637            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2638            return packFloat64( zSign, 0x7FF, 0 );
2639        }
2640        if ( aExp == 0 ) {
2641            ++expDiff;
2642        }
2643        else {
2644            aSig |= LIT64( 0x2000000000000000 );
2645        }
2646        shift64RightJamming( aSig, - expDiff, &aSig );
2647        zExp = bExp;
2648    }
2649    else {
2650        if ( aExp == 0x7FF ) {
2651            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652            return a;
2653        }
2654        if ( aExp == 0 ) {
2655            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2656            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2657        }
2658        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2659        zExp = aExp;
2660        goto roundAndPack;
2661    }
2662    aSig |= LIT64( 0x2000000000000000 );
2663    zSig = ( aSig + bSig )<<1;
2664    --zExp;
2665    if ( (sbits64) zSig < 0 ) {
2666        zSig = aSig + bSig;
2667        ++zExp;
2668    }
2669 roundAndPack:
2670    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2671
2672}
2673
2674/*----------------------------------------------------------------------------
2675| Returns the result of subtracting the absolute values of the double-
2676| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2677| difference is negated before being returned.  `zSign' is ignored if the
2678| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2679| Standard for Binary Floating-Point Arithmetic.
2680*----------------------------------------------------------------------------*/
2681
2682static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2683{
2684    int16 aExp, bExp, zExp;
2685    bits64 aSig, bSig, zSig;
2686    int16 expDiff;
2687
2688    aSig = extractFloat64Frac( a );
2689    aExp = extractFloat64Exp( a );
2690    bSig = extractFloat64Frac( b );
2691    bExp = extractFloat64Exp( b );
2692    expDiff = aExp - bExp;
2693    aSig <<= 10;
2694    bSig <<= 10;
2695    if ( 0 < expDiff ) goto aExpBigger;
2696    if ( expDiff < 0 ) goto bExpBigger;
2697    if ( aExp == 0x7FF ) {
2698        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2699        float_raise( float_flag_invalid STATUS_VAR);
2700        return float64_default_nan;
2701    }
2702    if ( aExp == 0 ) {
2703        aExp = 1;
2704        bExp = 1;
2705    }
2706    if ( bSig < aSig ) goto aBigger;
2707    if ( aSig < bSig ) goto bBigger;
2708    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2709 bExpBigger:
2710    if ( bExp == 0x7FF ) {
2711        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2712        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2713    }
2714    if ( aExp == 0 ) {
2715        ++expDiff;
2716    }
2717    else {
2718        aSig |= LIT64( 0x4000000000000000 );
2719    }
2720    shift64RightJamming( aSig, - expDiff, &aSig );
2721    bSig |= LIT64( 0x4000000000000000 );
2722 bBigger:
2723    zSig = bSig - aSig;
2724    zExp = bExp;
2725    zSign ^= 1;
2726    goto normalizeRoundAndPack;
2727 aExpBigger:
2728    if ( aExp == 0x7FF ) {
2729        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2730        return a;
2731    }
2732    if ( bExp == 0 ) {
2733        --expDiff;
2734    }
2735    else {
2736        bSig |= LIT64( 0x4000000000000000 );
2737    }
2738    shift64RightJamming( bSig, expDiff, &bSig );
2739    aSig |= LIT64( 0x4000000000000000 );
2740 aBigger:
2741    zSig = aSig - bSig;
2742    zExp = aExp;
2743 normalizeRoundAndPack:
2744    --zExp;
2745    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2746
2747}
2748
2749/*----------------------------------------------------------------------------
2750| Returns the result of adding the double-precision floating-point values `a'
2751| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2752| Binary Floating-Point Arithmetic.
2753*----------------------------------------------------------------------------*/
2754
2755float64 float64_add( float64 a, float64 b STATUS_PARAM )
2756{
2757    flag aSign, bSign;
2758
2759    aSign = extractFloat64Sign( a );
2760    bSign = extractFloat64Sign( b );
2761    if ( aSign == bSign ) {
2762        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2763    }
2764    else {
2765        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2766    }
2767
2768}
2769
2770/*----------------------------------------------------------------------------
2771| Returns the result of subtracting the double-precision floating-point values
2772| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2773| for Binary Floating-Point Arithmetic.
2774*----------------------------------------------------------------------------*/
2775
2776float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2777{
2778    flag aSign, bSign;
2779
2780    aSign = extractFloat64Sign( a );
2781    bSign = extractFloat64Sign( b );
2782    if ( aSign == bSign ) {
2783        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2784    }
2785    else {
2786        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2787    }
2788
2789}
2790
2791/*----------------------------------------------------------------------------
2792| Returns the result of multiplying the double-precision floating-point values
2793| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2794| for Binary Floating-Point Arithmetic.
2795*----------------------------------------------------------------------------*/
2796
2797float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2798{
2799    flag aSign, bSign, zSign;
2800    int16 aExp, bExp, zExp;
2801    bits64 aSig, bSig, zSig0, zSig1;
2802
2803    aSig = extractFloat64Frac( a );
2804    aExp = extractFloat64Exp( a );
2805    aSign = extractFloat64Sign( a );
2806    bSig = extractFloat64Frac( b );
2807    bExp = extractFloat64Exp( b );
2808    bSign = extractFloat64Sign( b );
2809    zSign = aSign ^ bSign;
2810    if ( aExp == 0x7FF ) {
2811        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2812            return propagateFloat64NaN( a, b STATUS_VAR );
2813        }
2814        if ( ( bExp | bSig ) == 0 ) {
2815            float_raise( float_flag_invalid STATUS_VAR);
2816            return float64_default_nan;
2817        }
2818        return packFloat64( zSign, 0x7FF, 0 );
2819    }
2820    if ( bExp == 0x7FF ) {
2821        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822        if ( ( aExp | aSig ) == 0 ) {
2823            float_raise( float_flag_invalid STATUS_VAR);
2824            return float64_default_nan;
2825        }
2826        return packFloat64( zSign, 0x7FF, 0 );
2827    }
2828    if ( aExp == 0 ) {
2829        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2830        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2831    }
2832    if ( bExp == 0 ) {
2833        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2834        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835    }
2836    zExp = aExp + bExp - 0x3FF;
2837    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2838    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2839    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2840    zSig0 |= ( zSig1 != 0 );
2841    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2842        zSig0 <<= 1;
2843        --zExp;
2844    }
2845    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2846
2847}
2848
2849/*----------------------------------------------------------------------------
2850| Returns the result of dividing the double-precision floating-point value `a'
2851| by the corresponding value `b'.  The operation is performed according to
2852| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2853*----------------------------------------------------------------------------*/
2854
2855float64 float64_div( float64 a, float64 b STATUS_PARAM )
2856{
2857    flag aSign, bSign, zSign;
2858    int16 aExp, bExp, zExp;
2859    bits64 aSig, bSig, zSig;
2860    bits64 rem0, rem1;
2861    bits64 term0, term1;
2862
2863    aSig = extractFloat64Frac( a );
2864    aExp = extractFloat64Exp( a );
2865    aSign = extractFloat64Sign( a );
2866    bSig = extractFloat64Frac( b );
2867    bExp = extractFloat64Exp( b );
2868    bSign = extractFloat64Sign( b );
2869    zSign = aSign ^ bSign;
2870    if ( aExp == 0x7FF ) {
2871        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2872        if ( bExp == 0x7FF ) {
2873            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2874            float_raise( float_flag_invalid STATUS_VAR);
2875            return float64_default_nan;
2876        }
2877        return packFloat64( zSign, 0x7FF, 0 );
2878    }
2879    if ( bExp == 0x7FF ) {
2880        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2881        return packFloat64( zSign, 0, 0 );
2882    }
2883    if ( bExp == 0 ) {
2884        if ( bSig == 0 ) {
2885            if ( ( aExp | aSig ) == 0 ) {
2886                float_raise( float_flag_invalid STATUS_VAR);
2887                return float64_default_nan;
2888            }
2889            float_raise( float_flag_divbyzero STATUS_VAR);
2890            return packFloat64( zSign, 0x7FF, 0 );
2891        }
2892        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2893    }
2894    if ( aExp == 0 ) {
2895        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2896        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2897    }
2898    zExp = aExp - bExp + 0x3FD;
2899    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2900    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2901    if ( bSig <= ( aSig + aSig ) ) {
2902        aSig >>= 1;
2903        ++zExp;
2904    }
2905    zSig = estimateDiv128To64( aSig, 0, bSig );
2906    if ( ( zSig & 0x1FF ) <= 2 ) {
2907        mul64To128( bSig, zSig, &term0, &term1 );
2908        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2909        while ( (sbits64) rem0 < 0 ) {
2910            --zSig;
2911            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2912        }
2913        zSig |= ( rem1 != 0 );
2914    }
2915    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2916
2917}
2918
2919/*----------------------------------------------------------------------------
2920| Returns the remainder of the double-precision floating-point value `a'
2921| with respect to the corresponding value `b'.  The operation is performed
2922| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923*----------------------------------------------------------------------------*/
2924
2925float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2926{
2927    flag aSign, bSign, zSign;
2928    int16 aExp, bExp, expDiff;
2929    bits64 aSig, bSig;
2930    bits64 q, alternateASig;
2931    sbits64 sigMean;
2932
2933    aSig = extractFloat64Frac( a );
2934    aExp = extractFloat64Exp( a );
2935    aSign = extractFloat64Sign( a );
2936    bSig = extractFloat64Frac( b );
2937    bExp = extractFloat64Exp( b );
2938    bSign = extractFloat64Sign( b );
2939    if ( aExp == 0x7FF ) {
2940        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2941            return propagateFloat64NaN( a, b STATUS_VAR );
2942        }
2943        float_raise( float_flag_invalid STATUS_VAR);
2944        return float64_default_nan;
2945    }
2946    if ( bExp == 0x7FF ) {
2947        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2948        return a;
2949    }
2950    if ( bExp == 0 ) {
2951        if ( bSig == 0 ) {
2952            float_raise( float_flag_invalid STATUS_VAR);
2953            return float64_default_nan;
2954        }
2955        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956    }
2957    if ( aExp == 0 ) {
2958        if ( aSig == 0 ) return a;
2959        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960    }
2961    expDiff = aExp - bExp;
2962    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2963    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964    if ( expDiff < 0 ) {
2965        if ( expDiff < -1 ) return a;
2966        aSig >>= 1;
2967    }
2968    q = ( bSig <= aSig );
2969    if ( q ) aSig -= bSig;
2970    expDiff -= 64;
2971    while ( 0 < expDiff ) {
2972        q = estimateDiv128To64( aSig, 0, bSig );
2973        q = ( 2 < q ) ? q - 2 : 0;
2974        aSig = - ( ( bSig>>2 ) * q );
2975        expDiff -= 62;
2976    }
2977    expDiff += 64;
2978    if ( 0 < expDiff ) {
2979        q = estimateDiv128To64( aSig, 0, bSig );
2980        q = ( 2 < q ) ? q - 2 : 0;
2981        q >>= 64 - expDiff;
2982        bSig >>= 2;
2983        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2984    }
2985    else {
2986        aSig >>= 2;
2987        bSig >>= 2;
2988    }
2989    do {
2990        alternateASig = aSig;
2991        ++q;
2992        aSig -= bSig;
2993    } while ( 0 <= (sbits64) aSig );
2994    sigMean = aSig + alternateASig;
2995    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2996        aSig = alternateASig;
2997    }
2998    zSign = ( (sbits64) aSig < 0 );
2999    if ( zSign ) aSig = - aSig;
3000    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3001
3002}
3003
3004/*----------------------------------------------------------------------------
3005| Returns the square root of the double-precision floating-point value `a'.
3006| The operation is performed according to the IEC/IEEE Standard for Binary
3007| Floating-Point Arithmetic.
3008*----------------------------------------------------------------------------*/
3009
3010float64 float64_sqrt( float64 a STATUS_PARAM )
3011{
3012    flag aSign;
3013    int16 aExp, zExp;
3014    bits64 aSig, zSig, doubleZSig;
3015    bits64 rem0, rem1, term0, term1;
3016
3017    aSig = extractFloat64Frac( a );
3018    aExp = extractFloat64Exp( a );
3019    aSign = extractFloat64Sign( a );
3020    if ( aExp == 0x7FF ) {
3021        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3022        if ( ! aSign ) return a;
3023        float_raise( float_flag_invalid STATUS_VAR);
3024        return float64_default_nan;
3025    }
3026    if ( aSign ) {
3027        if ( ( aExp | aSig ) == 0 ) return a;
3028        float_raise( float_flag_invalid STATUS_VAR);
3029        return float64_default_nan;
3030    }
3031    if ( aExp == 0 ) {
3032        if ( aSig == 0 ) return float64_zero;
3033        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034    }
3035    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3036    aSig |= LIT64( 0x0010000000000000 );
3037    zSig = estimateSqrt32( aExp, aSig>>21 );
3038    aSig <<= 9 - ( aExp & 1 );
3039    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3040    if ( ( zSig & 0x1FF ) <= 5 ) {
3041        doubleZSig = zSig<<1;
3042        mul64To128( zSig, zSig, &term0, &term1 );
3043        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3044        while ( (sbits64) rem0 < 0 ) {
3045            --zSig;
3046            doubleZSig -= 2;
3047            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3048        }
3049        zSig |= ( ( rem0 | rem1 ) != 0 );
3050    }
3051    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3052
3053}
3054
3055/*----------------------------------------------------------------------------
3056| Returns the binary log of the double-precision floating-point value `a'.
3057| The operation is performed according to the IEC/IEEE Standard for Binary
3058| Floating-Point Arithmetic.
3059*----------------------------------------------------------------------------*/
3060float64 float64_log2( float64 a STATUS_PARAM )
3061{
3062    flag aSign, zSign;
3063    int16 aExp;
3064    bits64 aSig, aSig0, aSig1, zSig, i;
3065
3066    aSig = extractFloat64Frac( a );
3067    aExp = extractFloat64Exp( a );
3068    aSign = extractFloat64Sign( a );
3069
3070    if ( aExp == 0 ) {
3071        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3072        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3073    }
3074    if ( aSign ) {
3075        float_raise( float_flag_invalid STATUS_VAR);
3076        return float64_default_nan;
3077    }
3078    if ( aExp == 0x7FF ) {
3079        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3080        return a;
3081    }
3082
3083    aExp -= 0x3FF;
3084    aSig |= LIT64( 0x0010000000000000 );
3085    zSign = aExp < 0;
3086    zSig = (bits64)aExp << 52;
3087    for (i = 1LL << 51; i > 0; i >>= 1) {
3088        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3089        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3090        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3091            aSig >>= 1;
3092            zSig |= i;
3093        }
3094    }
3095
3096    if ( zSign )
3097        zSig = -zSig;
3098    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3099}
3100
3101/*----------------------------------------------------------------------------
3102| Returns 1 if the double-precision floating-point value `a' is equal to the
3103| corresponding value `b', and 0 otherwise.  The comparison is performed
3104| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3105*----------------------------------------------------------------------------*/
3106
3107int float64_eq( float64 a, float64 b STATUS_PARAM )
3108{
3109    bits64 av, bv;
3110
3111    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3113       ) {
3114        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3115            float_raise( float_flag_invalid STATUS_VAR);
3116        }
3117        return 0;
3118    }
3119    av = float64_val(a);
3120    bv = float64_val(b);
3121    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3122
3123}
3124
3125/*----------------------------------------------------------------------------
3126| Returns 1 if the double-precision floating-point value `a' is less than or
3127| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3128| performed according to the IEC/IEEE Standard for Binary Floating-Point
3129| Arithmetic.
3130*----------------------------------------------------------------------------*/
3131
3132int float64_le( float64 a, float64 b STATUS_PARAM )
3133{
3134    flag aSign, bSign;
3135    bits64 av, bv;
3136
3137    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139       ) {
3140        float_raise( float_flag_invalid STATUS_VAR);
3141        return 0;
3142    }
3143    aSign = extractFloat64Sign( a );
3144    bSign = extractFloat64Sign( b );
3145    av = float64_val(a);
3146    bv = float64_val(b);
3147    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3148    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3149
3150}
3151
3152/*----------------------------------------------------------------------------
3153| Returns 1 if the double-precision floating-point value `a' is less than
3154| the corresponding value `b', and 0 otherwise.  The comparison is performed
3155| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3156*----------------------------------------------------------------------------*/
3157
3158int float64_lt( float64 a, float64 b STATUS_PARAM )
3159{
3160    flag aSign, bSign;
3161    bits64 av, bv;
3162
3163    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3164         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3165       ) {
3166        float_raise( float_flag_invalid STATUS_VAR);
3167        return 0;
3168    }
3169    aSign = extractFloat64Sign( a );
3170    bSign = extractFloat64Sign( b );
3171    av = float64_val(a);
3172    bv = float64_val(b);
3173    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3174    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3175
3176}
3177
3178/*----------------------------------------------------------------------------
3179| Returns 1 if the double-precision floating-point value `a' is equal to the
3180| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3181| if either operand is a NaN.  Otherwise, the comparison is performed
3182| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3183*----------------------------------------------------------------------------*/
3184
3185int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3186{
3187    bits64 av, bv;
3188
3189    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3190         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3191       ) {
3192        float_raise( float_flag_invalid STATUS_VAR);
3193        return 0;
3194    }
3195    av = float64_val(a);
3196    bv = float64_val(b);
3197    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3198
3199}
3200
3201/*----------------------------------------------------------------------------
3202| Returns 1 if the double-precision floating-point value `a' is less than or
3203| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3204| cause an exception.  Otherwise, the comparison is performed according to the
3205| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3206*----------------------------------------------------------------------------*/
3207
3208int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3209{
3210    flag aSign, bSign;
3211    bits64 av, bv;
3212
3213    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3214         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3215       ) {
3216        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3217            float_raise( float_flag_invalid STATUS_VAR);
3218        }
3219        return 0;
3220    }
3221    aSign = extractFloat64Sign( a );
3222    bSign = extractFloat64Sign( b );
3223    av = float64_val(a);
3224    bv = float64_val(b);
3225    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3226    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3227
3228}
3229
3230/*----------------------------------------------------------------------------
3231| Returns 1 if the double-precision floating-point value `a' is less than
3232| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3233| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3234| Standard for Binary Floating-Point Arithmetic.
3235*----------------------------------------------------------------------------*/
3236
3237int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3238{
3239    flag aSign, bSign;
3240    bits64 av, bv;
3241
3242    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3243         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3244       ) {
3245        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3246            float_raise( float_flag_invalid STATUS_VAR);
3247        }
3248        return 0;
3249    }
3250    aSign = extractFloat64Sign( a );
3251    bSign = extractFloat64Sign( b );
3252    av = float64_val(a);
3253    bv = float64_val(b);
3254    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3255    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3256
3257}
3258
3259#ifdef FLOATX80
3260
3261/*----------------------------------------------------------------------------
3262| Returns the result of converting the extended double-precision floating-
3263| point value `a' to the 32-bit two's complement integer format.  The
3264| conversion is performed according to the IEC/IEEE Standard for Binary
3265| Floating-Point Arithmetic---which means in particular that the conversion
3266| is rounded according to the current rounding mode.  If `a' is a NaN, the
3267| largest positive integer is returned.  Otherwise, if the conversion
3268| overflows, the largest integer with the same sign as `a' is returned.
3269*----------------------------------------------------------------------------*/
3270
3271int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3272{
3273    flag aSign;
3274    int32 aExp, shiftCount;
3275    bits64 aSig;
3276
3277    aSig = extractFloatx80Frac( a );
3278    aExp = extractFloatx80Exp( a );
3279    aSign = extractFloatx80Sign( a );
3280    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3281    shiftCount = 0x4037 - aExp;
3282    if ( shiftCount <= 0 ) shiftCount = 1;
3283    shift64RightJamming( aSig, shiftCount, &aSig );
3284    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3285
3286}
3287
3288/*----------------------------------------------------------------------------
3289| Returns the result of converting the extended double-precision floating-
3290| point value `a' to the 32-bit two's complement integer format.  The
3291| conversion is performed according to the IEC/IEEE Standard for Binary
3292| Floating-Point Arithmetic, except that the conversion is always rounded
3293| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3294| Otherwise, if the conversion overflows, the largest integer with the same
3295| sign as `a' is returned.
3296*----------------------------------------------------------------------------*/
3297
3298int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3299{
3300    flag aSign;
3301    int32 aExp, shiftCount;
3302    bits64 aSig, savedASig;
3303    int32 z;
3304
3305    aSig = extractFloatx80Frac( a );
3306    aExp = extractFloatx80Exp( a );
3307    aSign = extractFloatx80Sign( a );
3308    if ( 0x401E < aExp ) {
3309        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3310        goto invalid;
3311    }
3312    else if ( aExp < 0x3FFF ) {
3313        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3314        return 0;
3315    }
3316    shiftCount = 0x403E - aExp;
3317    savedASig = aSig;
3318    aSig >>= shiftCount;
3319    z = aSig;
3320    if ( aSign ) z = - z;
3321    if ( ( z < 0 ) ^ aSign ) {
3322 invalid:
3323        float_raise( float_flag_invalid STATUS_VAR);
3324        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3325    }
3326    if ( ( aSig<<shiftCount ) != savedASig ) {
3327        STATUS(float_exception_flags) |= float_flag_inexact;
3328    }
3329    return z;
3330
3331}
3332
3333/*----------------------------------------------------------------------------
3334| Returns the result of converting the extended double-precision floating-
3335| point value `a' to the 64-bit two's complement integer format.  The
3336| conversion is performed according to the IEC/IEEE Standard for Binary
3337| Floating-Point Arithmetic---which means in particular that the conversion
3338| is rounded according to the current rounding mode.  If `a' is a NaN,
3339| the largest positive integer is returned.  Otherwise, if the conversion
3340| overflows, the largest integer with the same sign as `a' is returned.
3341*----------------------------------------------------------------------------*/
3342
3343int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3344{
3345    flag aSign;
3346    int32 aExp, shiftCount;
3347    bits64 aSig, aSigExtra;
3348
3349    aSig = extractFloatx80Frac( a );
3350    aExp = extractFloatx80Exp( a );
3351    aSign = extractFloatx80Sign( a );
3352    shiftCount = 0x403E - aExp;
3353    if ( shiftCount <= 0 ) {
3354        if ( shiftCount ) {
3355            float_raise( float_flag_invalid STATUS_VAR);
3356            if (    ! aSign
3357                 || (    ( aExp == 0x7FFF )
3358                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3359               ) {
3360                return LIT64( 0x7FFFFFFFFFFFFFFF );
3361            }
3362            return (sbits64) LIT64( 0x8000000000000000 );
3363        }
3364        aSigExtra = 0;
3365    }
3366    else {
3367        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3368    }
3369    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3370
3371}
3372
3373/*----------------------------------------------------------------------------
3374| Returns the result of converting the extended double-precision floating-
3375| point value `a' to the 64-bit two's complement integer format.  The
3376| conversion is performed according to the IEC/IEEE Standard for Binary
3377| Floating-Point Arithmetic, except that the conversion is always rounded
3378| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3379| Otherwise, if the conversion overflows, the largest integer with the same
3380| sign as `a' is returned.
3381*----------------------------------------------------------------------------*/
3382
3383int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3384{
3385    flag aSign;
3386    int32 aExp, shiftCount;
3387    bits64 aSig;
3388    int64 z;
3389
3390    aSig = extractFloatx80Frac( a );
3391    aExp = extractFloatx80Exp( a );
3392    aSign = extractFloatx80Sign( a );
3393    shiftCount = aExp - 0x403E;
3394    if ( 0 <= shiftCount ) {
3395        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3396        if ( ( a.high != 0xC03E ) || aSig ) {
3397            float_raise( float_flag_invalid STATUS_VAR);
3398            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3399                return LIT64( 0x7FFFFFFFFFFFFFFF );
3400            }
3401        }
3402        return (sbits64) LIT64( 0x8000000000000000 );
3403    }
3404    else if ( aExp < 0x3FFF ) {
3405        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3406        return 0;
3407    }
3408    z = aSig>>( - shiftCount );
3409    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3410        STATUS(float_exception_flags) |= float_flag_inexact;
3411    }
3412    if ( aSign ) z = - z;
3413    return z;
3414
3415}
3416
3417/*----------------------------------------------------------------------------
3418| Returns the result of converting the extended double-precision floating-
3419| point value `a' to the single-precision floating-point format.  The
3420| conversion is performed according to the IEC/IEEE Standard for Binary
3421| Floating-Point Arithmetic.
3422*----------------------------------------------------------------------------*/
3423
3424float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3425{
3426    flag aSign;
3427    int32 aExp;
3428    bits64 aSig;
3429
3430    aSig = extractFloatx80Frac( a );
3431    aExp = extractFloatx80Exp( a );
3432    aSign = extractFloatx80Sign( a );
3433    if ( aExp == 0x7FFF ) {
3434        if ( (bits64) ( aSig<<1 ) ) {
3435            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3436        }
3437        return packFloat32( aSign, 0xFF, 0 );
3438    }
3439    shift64RightJamming( aSig, 33, &aSig );
3440    if ( aExp || aSig ) aExp -= 0x3F81;
3441    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3442
3443}
3444
3445/*----------------------------------------------------------------------------
3446| Returns the result of converting the extended double-precision floating-
3447| point value `a' to the double-precision floating-point format.  The
3448| conversion is performed according to the IEC/IEEE Standard for Binary
3449| Floating-Point Arithmetic.
3450*----------------------------------------------------------------------------*/
3451
3452float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3453{
3454    flag aSign;
3455    int32 aExp;
3456    bits64 aSig, zSig;
3457
3458    aSig = extractFloatx80Frac( a );
3459    aExp = extractFloatx80Exp( a );
3460    aSign = extractFloatx80Sign( a );
3461    if ( aExp == 0x7FFF ) {
3462        if ( (bits64) ( aSig<<1 ) ) {
3463            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3464        }
3465        return packFloat64( aSign, 0x7FF, 0 );
3466    }
3467    shift64RightJamming( aSig, 1, &zSig );
3468    if ( aExp || aSig ) aExp -= 0x3C01;
3469    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3470
3471}
3472
3473#ifdef FLOAT128
3474
3475/*----------------------------------------------------------------------------
3476| Returns the result of converting the extended double-precision floating-
3477| point value `a' to the quadruple-precision floating-point format.  The
3478| conversion is performed according to the IEC/IEEE Standard for Binary
3479| Floating-Point Arithmetic.
3480*----------------------------------------------------------------------------*/
3481
3482float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3483{
3484    flag aSign;
3485    int16 aExp;
3486    bits64 aSig, zSig0, zSig1;
3487
3488    aSig = extractFloatx80Frac( a );
3489    aExp = extractFloatx80Exp( a );
3490    aSign = extractFloatx80Sign( a );
3491    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3492        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3493    }
3494    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3495    return packFloat128( aSign, aExp, zSig0, zSig1 );
3496
3497}
3498
3499#endif
3500
3501/*----------------------------------------------------------------------------
3502| Rounds the extended double-precision floating-point value `a' to an integer,
3503| and returns the result as an extended quadruple-precision floating-point
3504| value.  The operation is performed according to the IEC/IEEE Standard for
3505| Binary Floating-Point Arithmetic.
3506*----------------------------------------------------------------------------*/
3507
3508floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3509{
3510    flag aSign;
3511    int32 aExp;
3512    bits64 lastBitMask, roundBitsMask;
3513    int8 roundingMode;
3514    floatx80 z;
3515
3516    aExp = extractFloatx80Exp( a );
3517    if ( 0x403E <= aExp ) {
3518        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3519            return propagateFloatx80NaN( a, a STATUS_VAR );
3520        }
3521        return a;
3522    }
3523    if ( aExp < 0x3FFF ) {
3524        if (    ( aExp == 0 )
3525             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3526            return a;
3527        }
3528        STATUS(float_exception_flags) |= float_flag_inexact;
3529        aSign = extractFloatx80Sign( a );
3530        switch ( STATUS(float_rounding_mode) ) {
3531         case float_round_nearest_even:
3532            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3533               ) {
3534                return
3535                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536            }
3537            break;
3538         case float_round_down:
3539            return
3540                  aSign ?
3541                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3542                : packFloatx80( 0, 0, 0 );
3543         case float_round_up:
3544            return
3545                  aSign ? packFloatx80( 1, 0, 0 )
3546                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3547        }
3548        return packFloatx80( aSign, 0, 0 );
3549    }
3550    lastBitMask = 1;
3551    lastBitMask <<= 0x403E - aExp;
3552    roundBitsMask = lastBitMask - 1;
3553    z = a;
3554    roundingMode = STATUS(float_rounding_mode);
3555    if ( roundingMode == float_round_nearest_even ) {
3556        z.low += lastBitMask>>1;
3557        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3558    }
3559    else if ( roundingMode != float_round_to_zero ) {
3560        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3561            z.low += roundBitsMask;
3562        }
3563    }
3564    z.low &= ~ roundBitsMask;
3565    if ( z.low == 0 ) {
3566        ++z.high;
3567        z.low = LIT64( 0x8000000000000000 );
3568    }
3569    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3570    return z;
3571
3572}
3573
3574/*----------------------------------------------------------------------------
3575| Returns the result of adding the absolute values of the extended double-
3576| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3577| negated before being returned.  `zSign' is ignored if the result is a NaN.
3578| The addition is performed according to the IEC/IEEE Standard for Binary
3579| Floating-Point Arithmetic.
3580*----------------------------------------------------------------------------*/
3581
3582static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3583{
3584    int32 aExp, bExp, zExp;
3585    bits64 aSig, bSig, zSig0, zSig1;
3586    int32 expDiff;
3587
3588    aSig = extractFloatx80Frac( a );
3589    aExp = extractFloatx80Exp( a );
3590    bSig = extractFloatx80Frac( b );
3591    bExp = extractFloatx80Exp( b );
3592    expDiff = aExp - bExp;
3593    if ( 0 < expDiff ) {
3594        if ( aExp == 0x7FFF ) {
3595            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3596            return a;
3597        }
3598        if ( bExp == 0 ) --expDiff;
3599        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3600        zExp = aExp;
3601    }
3602    else if ( expDiff < 0 ) {
3603        if ( bExp == 0x7FFF ) {
3604            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3605            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3606        }
3607        if ( aExp == 0 ) ++expDiff;
3608        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3609        zExp = bExp;
3610    }
3611    else {
3612        if ( aExp == 0x7FFF ) {
3613            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3614                return propagateFloatx80NaN( a, b STATUS_VAR );
3615            }
3616            return a;
3617        }
3618        zSig1 = 0;
3619        zSig0 = aSig + bSig;
3620        if ( aExp == 0 ) {
3621            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3622            goto roundAndPack;
3623        }
3624        zExp = aExp;
3625        goto shiftRight1;
3626    }
3627    zSig0 = aSig + bSig;
3628    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3629 shiftRight1:
3630    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631    zSig0 |= LIT64( 0x8000000000000000 );
3632    ++zExp;
3633 roundAndPack:
3634    return
3635        roundAndPackFloatx80(
3636            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3637
3638}
3639
3640/*----------------------------------------------------------------------------
3641| Returns the result of subtracting the absolute values of the extended
3642| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3643| difference is negated before being returned.  `zSign' is ignored if the
3644| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3645| Standard for Binary Floating-Point Arithmetic.
3646*----------------------------------------------------------------------------*/
3647
3648static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3649{
3650    int32 aExp, bExp, zExp;
3651    bits64 aSig, bSig, zSig0, zSig1;
3652    int32 expDiff;
3653    floatx80 z;
3654
3655    aSig = extractFloatx80Frac( a );
3656    aExp = extractFloatx80Exp( a );
3657    bSig = extractFloatx80Frac( b );
3658    bExp = extractFloatx80Exp( b );
3659    expDiff = aExp - bExp;
3660    if ( 0 < expDiff ) goto aExpBigger;
3661    if ( expDiff < 0 ) goto bExpBigger;
3662    if ( aExp == 0x7FFF ) {
3663        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3664            return propagateFloatx80NaN( a, b STATUS_VAR );
3665        }
3666        float_raise( float_flag_invalid STATUS_VAR);
3667        z.low = floatx80_default_nan_low;
3668        z.high = floatx80_default_nan_high;
3669        return z;
3670    }
3671    if ( aExp == 0 ) {
3672        aExp = 1;
3673        bExp = 1;
3674    }
3675    zSig1 = 0;
3676    if ( bSig < aSig ) goto aBigger;
3677    if ( aSig < bSig ) goto bBigger;
3678    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3679 bExpBigger:
3680    if ( bExp == 0x7FFF ) {
3681        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3682        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683    }
3684    if ( aExp == 0 ) ++expDiff;
3685    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3686 bBigger:
3687    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3688    zExp = bExp;
3689    zSign ^= 1;
3690    goto normalizeRoundAndPack;
3691 aExpBigger:
3692    if ( aExp == 0x7FFF ) {
3693        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3694        return a;
3695    }
3696    if ( bExp == 0 ) --expDiff;
3697    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3698 aBigger:
3699    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3700    zExp = aExp;
3701 normalizeRoundAndPack:
3702    return
3703        normalizeRoundAndPackFloatx80(
3704            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705
3706}
3707
3708/*----------------------------------------------------------------------------
3709| Returns the result of adding the extended double-precision floating-point
3710| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3711| Standard for Binary Floating-Point Arithmetic.
3712*----------------------------------------------------------------------------*/
3713
3714floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3715{
3716    flag aSign, bSign;
3717
3718    aSign = extractFloatx80Sign( a );
3719    bSign = extractFloatx80Sign( b );
3720    if ( aSign == bSign ) {
3721        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3722    }
3723    else {
3724        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3725    }
3726
3727}
3728
3729/*----------------------------------------------------------------------------
3730| Returns the result of subtracting the extended double-precision floating-
3731| point values `a' and `b'.  The operation is performed according to the
3732| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3733*----------------------------------------------------------------------------*/
3734
3735floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3736{
3737    flag aSign, bSign;
3738
3739    aSign = extractFloatx80Sign( a );
3740    bSign = extractFloatx80Sign( b );
3741    if ( aSign == bSign ) {
3742        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3743    }
3744    else {
3745        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3746    }
3747
3748}
3749
3750/*----------------------------------------------------------------------------
3751| Returns the result of multiplying the extended double-precision floating-
3752| point values `a' and `b'.  The operation is performed according to the
3753| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3754*----------------------------------------------------------------------------*/
3755
3756floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3757{
3758    flag aSign, bSign, zSign;
3759    int32 aExp, bExp, zExp;
3760    bits64 aSig, bSig, zSig0, zSig1;
3761    floatx80 z;
3762
3763    aSig = extractFloatx80Frac( a );
3764    aExp = extractFloatx80Exp( a );
3765    aSign = extractFloatx80Sign( a );
3766    bSig = extractFloatx80Frac( b );
3767    bExp = extractFloatx80Exp( b );
3768    bSign = extractFloatx80Sign( b );
3769    zSign = aSign ^ bSign;
3770    if ( aExp == 0x7FFF ) {
3771        if (    (bits64) ( aSig<<1 )
3772             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3773            return propagateFloatx80NaN( a, b STATUS_VAR );
3774        }
3775        if ( ( bExp | bSig ) == 0 ) goto invalid;
3776        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3777    }
3778    if ( bExp == 0x7FFF ) {
3779        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3780        if ( ( aExp | aSig ) == 0 ) {
3781 invalid:
3782            float_raise( float_flag_invalid STATUS_VAR);
3783            z.low = floatx80_default_nan_low;
3784            z.high = floatx80_default_nan_high;
3785            return z;
3786        }
3787        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3788    }
3789    if ( aExp == 0 ) {
3790        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3791        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3792    }
3793    if ( bExp == 0 ) {
3794        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3795        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3796    }
3797    zExp = aExp + bExp - 0x3FFE;
3798    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3799    if ( 0 < (sbits64) zSig0 ) {
3800        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3801        --zExp;
3802    }
3803    return
3804        roundAndPackFloatx80(
3805            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3806
3807}
3808
3809/*----------------------------------------------------------------------------
3810| Returns the result of dividing the extended double-precision floating-point
3811| value `a' by the corresponding value `b'.  The operation is performed
3812| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3813*----------------------------------------------------------------------------*/
3814
3815floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3816{
3817    flag aSign, bSign, zSign;
3818    int32 aExp, bExp, zExp;
3819    bits64 aSig, bSig, zSig0, zSig1;
3820    bits64 rem0, rem1, rem2, term0, term1, term2;
3821    floatx80 z;
3822
3823    aSig = extractFloatx80Frac( a );
3824    aExp = extractFloatx80Exp( a );
3825    aSign = extractFloatx80Sign( a );
3826    bSig = extractFloatx80Frac( b );
3827    bExp = extractFloatx80Exp( b );
3828    bSign = extractFloatx80Sign( b );
3829    zSign = aSign ^ bSign;
3830    if ( aExp == 0x7FFF ) {
3831        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3832        if ( bExp == 0x7FFF ) {
3833            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3834            goto invalid;
3835        }
3836        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3837    }
3838    if ( bExp == 0x7FFF ) {
3839        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3840        return packFloatx80( zSign, 0, 0 );
3841    }
3842    if ( bExp == 0 ) {
3843        if ( bSig == 0 ) {
3844            if ( ( aExp | aSig ) == 0 ) {
3845 invalid:
3846                float_raise( float_flag_invalid STATUS_VAR);
3847                z.low = floatx80_default_nan_low;
3848                z.high = floatx80_default_nan_high;
3849                return z;
3850            }
3851            float_raise( float_flag_divbyzero STATUS_VAR);
3852            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3853        }
3854        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3855    }
3856    if ( aExp == 0 ) {
3857        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3858        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3859    }
3860    zExp = aExp - bExp + 0x3FFE;
3861    rem1 = 0;
3862    if ( bSig <= aSig ) {
3863        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3864        ++zExp;
3865    }
3866    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3867    mul64To128( bSig, zSig0, &term0, &term1 );
3868    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3869    while ( (sbits64) rem0 < 0 ) {
3870        --zSig0;
3871        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3872    }
3873    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3874    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3875        mul64To128( bSig, zSig1, &term1, &term2 );
3876        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3877        while ( (sbits64) rem1 < 0 ) {
3878            --zSig1;
3879            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3880        }
3881        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3882    }
3883    return
3884        roundAndPackFloatx80(
3885            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3886
3887}
3888
3889/*----------------------------------------------------------------------------
3890| Returns the remainder of the extended double-precision floating-point value
3891| `a' with respect to the corresponding value `b'.  The operation is performed
3892| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3893*----------------------------------------------------------------------------*/
3894
3895floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3896{
3897    flag aSign, bSign, zSign;
3898    int32 aExp, bExp, expDiff;
3899    bits64 aSig0, aSig1, bSig;
3900    bits64 q, term0, term1, alternateASig0, alternateASig1;
3901    floatx80 z;
3902
3903    aSig0 = extractFloatx80Frac( a );
3904    aExp = extractFloatx80Exp( a );
3905    aSign = extractFloatx80Sign( a );
3906    bSig = extractFloatx80Frac( b );
3907    bExp = extractFloatx80Exp( b );
3908    bSign = extractFloatx80Sign( b );
3909    if ( aExp == 0x7FFF ) {
3910        if (    (bits64) ( aSig0<<1 )
3911             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3912            return propagateFloatx80NaN( a, b STATUS_VAR );
3913        }
3914        goto invalid;
3915    }
3916    if ( bExp == 0x7FFF ) {
3917        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3918        return a;
3919    }
3920    if ( bExp == 0 ) {
3921        if ( bSig == 0 ) {
3922 invalid:
3923            float_raise( float_flag_invalid STATUS_VAR);
3924            z.low = floatx80_default_nan_low;
3925            z.high = floatx80_default_nan_high;
3926            return z;
3927        }
3928        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3929    }
3930    if ( aExp == 0 ) {
3931        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3932        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3933    }
3934    bSig |= LIT64( 0x8000000000000000 );
3935    zSign = aSign;
3936    expDiff = aExp - bExp;
3937    aSig1 = 0;
3938    if ( expDiff < 0 ) {
3939        if ( expDiff < -1 ) return a;
3940        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3941        expDiff = 0;
3942    }
3943    q = ( bSig <= aSig0 );
3944    if ( q ) aSig0 -= bSig;
3945    expDiff -= 64;
3946    while ( 0 < expDiff ) {
3947        q = estimateDiv128To64( aSig0, aSig1, bSig );
3948        q = ( 2 < q ) ? q - 2 : 0;
3949        mul64To128( bSig, q, &term0, &term1 );
3950        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3951        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3952        expDiff -= 62;
3953    }
3954    expDiff += 64;
3955    if ( 0 < expDiff ) {
3956        q = estimateDiv128To64( aSig0, aSig1, bSig );
3957        q = ( 2 < q ) ? q - 2 : 0;
3958        q >>= 64 - expDiff;
3959        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3960        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3962        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3963            ++q;
3964            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3965        }
3966    }
3967    else {
3968        term1 = 0;
3969        term0 = bSig;
3970    }
3971    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3972    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3973         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3974              && ( q & 1 ) )
3975       ) {
3976        aSig0 = alternateASig0;
3977        aSig1 = alternateASig1;
3978        zSign = ! zSign;
3979    }
3980    return
3981        normalizeRoundAndPackFloatx80(
3982            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3983
3984}
3985
3986/*----------------------------------------------------------------------------
3987| Returns the square root of the extended double-precision floating-point
3988| value `a'.  The operation is performed according to the IEC/IEEE Standard
3989| for Binary Floating-Point Arithmetic.
3990*----------------------------------------------------------------------------*/
3991
3992floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3993{
3994    flag aSign;
3995    int32 aExp, zExp;
3996    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3997    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3998    floatx80 z;
3999
4000    aSig0 = extractFloatx80Frac( a );
4001    aExp = extractFloatx80Exp( a );
4002    aSign = extractFloatx80Sign( a );
4003    if ( aExp == 0x7FFF ) {
4004        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4005        if ( ! aSign ) return a;
4006        goto invalid;
4007    }
4008    if ( aSign ) {
4009        if ( ( aExp | aSig0 ) == 0 ) return a;
4010 invalid:
4011        float_raise( float_flag_invalid STATUS_VAR);
4012        z.low = floatx80_default_nan_low;
4013        z.high = floatx80_default_nan_high;
4014        return z;
4015    }
4016    if ( aExp == 0 ) {
4017        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4018        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4019    }
4020    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4021    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4022    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4023    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4024    doubleZSig0 = zSig0<<1;
4025    mul64To128( zSig0, zSig0, &term0, &term1 );
4026    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4027    while ( (sbits64) rem0 < 0 ) {
4028        --zSig0;
4029        doubleZSig0 -= 2;
4030        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4031    }
4032    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4033    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4034        if ( zSig1 == 0 ) zSig1 = 1;
4035        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4036        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4037        mul64To128( zSig1, zSig1, &term2, &term3 );
4038        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4039        while ( (sbits64) rem1 < 0 ) {
4040            --zSig1;
4041            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4042            term3 |= 1;
4043            term2 |= doubleZSig0;
4044            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4045        }
4046        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4047    }
4048    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4049    zSig0 |= doubleZSig0;
4050    return
4051        roundAndPackFloatx80(
4052            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4053
4054}
4055
4056/*----------------------------------------------------------------------------
4057| Returns 1 if the extended double-precision floating-point value `a' is
4058| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4059| performed according to the IEC/IEEE Standard for Binary Floating-Point
4060| Arithmetic.
4061*----------------------------------------------------------------------------*/
4062
4063int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4064{
4065
4066    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4067              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4068         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4069              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4070       ) {
4071        if (    floatx80_is_signaling_nan( a )
4072             || floatx80_is_signaling_nan( b ) ) {
4073            float_raise( float_flag_invalid STATUS_VAR);
4074        }
4075        return 0;
4076    }
4077    return
4078           ( a.low == b.low )
4079        && (    ( a.high == b.high )
4080             || (    ( a.low == 0 )
4081                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4082           );
4083
4084}
4085
4086/*----------------------------------------------------------------------------
4087| Returns 1 if the extended double-precision floating-point value `a' is
4088| less than or equal to the corresponding value `b', and 0 otherwise.  The
4089| comparison is performed according to the IEC/IEEE Standard for Binary
4090| Floating-Point Arithmetic.
4091*----------------------------------------------------------------------------*/
4092
4093int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4094{
4095    flag aSign, bSign;
4096
4097    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4098              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4099         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4100              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4101       ) {
4102        float_raise( float_flag_invalid STATUS_VAR);
4103        return 0;
4104    }
4105    aSign = extractFloatx80Sign( a );
4106    bSign = extractFloatx80Sign( b );
4107    if ( aSign != bSign ) {
4108        return
4109               aSign
4110            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4111                 == 0 );
4112    }
4113    return
4114          aSign ? le128( b.high, b.low, a.high, a.low )
4115        : le128( a.high, a.low, b.high, b.low );
4116
4117}
4118
4119/*----------------------------------------------------------------------------
4120| Returns 1 if the extended double-precision floating-point value `a' is
4121| less than the corresponding value `b', and 0 otherwise.  The comparison
4122| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123| Arithmetic.
4124*----------------------------------------------------------------------------*/
4125
4126int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4127{
4128    flag aSign, bSign;
4129
4130    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4131              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4133              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134       ) {
4135        float_raise( float_flag_invalid STATUS_VAR);
4136        return 0;
4137    }
4138    aSign = extractFloatx80Sign( a );
4139    bSign = extractFloatx80Sign( b );
4140    if ( aSign != bSign ) {
4141        return
4142               aSign
4143            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144                 != 0 );
4145    }
4146    return
4147          aSign ? lt128( b.high, b.low, a.high, a.low )
4148        : lt128( a.high, a.low, b.high, b.low );
4149
4150}
4151
4152/*----------------------------------------------------------------------------
4153| Returns 1 if the extended double-precision floating-point value `a' is equal
4154| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4155| raised if either operand is a NaN.  Otherwise, the comparison is performed
4156| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4157*----------------------------------------------------------------------------*/
4158
4159int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4160{
4161
4162    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4163              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4164         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4165              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4166       ) {
4167        float_raise( float_flag_invalid STATUS_VAR);
4168        return 0;
4169    }
4170    return
4171           ( a.low == b.low )
4172        && (    ( a.high == b.high )
4173             || (    ( a.low == 0 )
4174                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4175           );
4176
4177}
4178
4179/*----------------------------------------------------------------------------
4180| Returns 1 if the extended double-precision floating-point value `a' is less
4181| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4182| do not cause an exception.  Otherwise, the comparison is performed according
4183| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184*----------------------------------------------------------------------------*/
4185
4186int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4187{
4188    flag aSign, bSign;
4189
4190    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4191              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4192         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4193              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4194       ) {
4195        if (    floatx80_is_signaling_nan( a )
4196             || floatx80_is_signaling_nan( b ) ) {
4197            float_raise( float_flag_invalid STATUS_VAR);
4198        }
4199        return 0;
4200    }
4201    aSign = extractFloatx80Sign( a );
4202    bSign = extractFloatx80Sign( b );
4203    if ( aSign != bSign ) {
4204        return
4205               aSign
4206            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4207                 == 0 );
4208    }
4209    return
4210          aSign ? le128( b.high, b.low, a.high, a.low )
4211        : le128( a.high, a.low, b.high, b.low );
4212
4213}
4214
4215/*----------------------------------------------------------------------------
4216| Returns 1 if the extended double-precision floating-point value `a' is less
4217| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4218| an exception.  Otherwise, the comparison is performed according to the
4219| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4220*----------------------------------------------------------------------------*/
4221
4222int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4223{
4224    flag aSign, bSign;
4225
4226    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4227              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4228         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4229              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4230       ) {
4231        if (    floatx80_is_signaling_nan( a )
4232             || floatx80_is_signaling_nan( b ) ) {
4233            float_raise( float_flag_invalid STATUS_VAR);
4234        }
4235        return 0;
4236    }
4237    aSign = extractFloatx80Sign( a );
4238    bSign = extractFloatx80Sign( b );
4239    if ( aSign != bSign ) {
4240        return
4241               aSign
4242            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4243                 != 0 );
4244    }
4245    return
4246          aSign ? lt128( b.high, b.low, a.high, a.low )
4247        : lt128( a.high, a.low, b.high, b.low );
4248
4249}
4250
4251#endif
4252
4253#ifdef FLOAT128
4254
4255/*----------------------------------------------------------------------------
4256| Returns the result of converting the quadruple-precision floating-point
4257| value `a' to the 32-bit two's complement integer format.  The conversion
4258| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259| Arithmetic---which means in particular that the conversion is rounded
4260| according to the current rounding mode.  If `a' is a NaN, the largest
4261| positive integer is returned.  Otherwise, if the conversion overflows, the
4262| largest integer with the same sign as `a' is returned.
4263*----------------------------------------------------------------------------*/
4264
4265int32 float128_to_int32( float128 a STATUS_PARAM )
4266{
4267    flag aSign;
4268    int32 aExp, shiftCount;
4269    bits64 aSig0, aSig1;
4270
4271    aSig1 = extractFloat128Frac1( a );
4272    aSig0 = extractFloat128Frac0( a );
4273    aExp = extractFloat128Exp( a );
4274    aSign = extractFloat128Sign( a );
4275    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4276    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4277    aSig0 |= ( aSig1 != 0 );
4278    shiftCount = 0x4028 - aExp;
4279    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4280    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4281
4282}
4283
4284/*----------------------------------------------------------------------------
4285| Returns the result of converting the quadruple-precision floating-point
4286| value `a' to the 32-bit two's complement integer format.  The conversion
4287| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4288| Arithmetic, except that the conversion is always rounded toward zero.  If
4289| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4290| conversion overflows, the largest integer with the same sign as `a' is
4291| returned.
4292*----------------------------------------------------------------------------*/
4293
4294int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4295{
4296    flag aSign;
4297    int32 aExp, shiftCount;
4298    bits64 aSig0, aSig1, savedASig;
4299    int32 z;
4300
4301    aSig1 = extractFloat128Frac1( a );
4302    aSig0 = extractFloat128Frac0( a );
4303    aExp = extractFloat128Exp( a );
4304    aSign = extractFloat128Sign( a );
4305    aSig0 |= ( aSig1 != 0 );
4306    if ( 0x401E < aExp ) {
4307        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4308        goto invalid;
4309    }
4310    else if ( aExp < 0x3FFF ) {
4311        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4312        return 0;
4313    }
4314    aSig0 |= LIT64( 0x0001000000000000 );
4315    shiftCount = 0x402F - aExp;
4316    savedASig = aSig0;
4317    aSig0 >>= shiftCount;
4318    z = aSig0;
4319    if ( aSign ) z = - z;
4320    if ( ( z < 0 ) ^ aSign ) {
4321 invalid:
4322        float_raise( float_flag_invalid STATUS_VAR);
4323        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4324    }
4325    if ( ( aSig0<<shiftCount ) != savedASig ) {
4326        STATUS(float_exception_flags) |= float_flag_inexact;
4327    }
4328    return z;
4329
4330}
4331
4332/*----------------------------------------------------------------------------
4333| Returns the result of converting the quadruple-precision floating-point
4334| value `a' to the 64-bit two's complement integer format.  The conversion
4335| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4336| Arithmetic---which means in particular that the conversion is rounded
4337| according to the current rounding mode.  If `a' is a NaN, the largest
4338| positive integer is returned.  Otherwise, if the conversion overflows, the
4339| largest integer with the same sign as `a' is returned.
4340*----------------------------------------------------------------------------*/
4341
4342int64 float128_to_int64( float128 a STATUS_PARAM )
4343{
4344    flag aSign;
4345    int32 aExp, shiftCount;
4346    bits64 aSig0, aSig1;
4347
4348    aSig1 = extractFloat128Frac1( a );
4349    aSig0 = extractFloat128Frac0( a );
4350    aExp = extractFloat128Exp( a );
4351    aSign = extractFloat128Sign( a );
4352    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4353    shiftCount = 0x402F - aExp;
4354    if ( shiftCount <= 0 ) {
4355        if ( 0x403E < aExp ) {
4356            float_raise( float_flag_invalid STATUS_VAR);
4357            if (    ! aSign
4358                 || (    ( aExp == 0x7FFF )
4359                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4360                    )
4361               ) {
4362                return LIT64( 0x7FFFFFFFFFFFFFFF );
4363            }
4364            return (sbits64) LIT64( 0x8000000000000000 );
4365        }
4366        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4367    }
4368    else {
4369        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4370    }
4371    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4372
4373}
4374
4375/*----------------------------------------------------------------------------
4376| Returns the result of converting the quadruple-precision floating-point
4377| value `a' to the 64-bit two's complement integer format.  The conversion
4378| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4379| Arithmetic, except that the conversion is always rounded toward zero.
4380| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4381| the conversion overflows, the largest integer with the same sign as `a' is
4382| returned.
4383*----------------------------------------------------------------------------*/
4384
4385int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4386{
4387    flag aSign;
4388    int32 aExp, shiftCount;
4389    bits64 aSig0, aSig1;
4390    int64 z;
4391
4392    aSig1 = extractFloat128Frac1( a );
4393    aSig0 = extractFloat128Frac0( a );
4394    aExp = extractFloat128Exp( a );
4395    aSign = extractFloat128Sign( a );
4396    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4397    shiftCount = aExp - 0x402F;
4398    if ( 0 < shiftCount ) {
4399        if ( 0x403E <= aExp ) {
4400            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4401            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4402                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4403                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4404            }
4405            else {
4406                float_raise( float_flag_invalid STATUS_VAR);
4407                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4408                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4409                }
4410            }
4411            return (sbits64) LIT64( 0x8000000000000000 );
4412        }
4413        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4414        if ( (bits64) ( aSig1<<shiftCount ) ) {
4415            STATUS(float_exception_flags) |= float_flag_inexact;
4416        }
4417    }
4418    else {
4419        if ( aExp < 0x3FFF ) {
4420            if ( aExp | aSig0 | aSig1 ) {
4421                STATUS(float_exception_flags) |= float_flag_inexact;
4422            }
4423            return 0;
4424        }
4425        z = aSig0>>( - shiftCount );
4426        if (    aSig1
4427             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4428            STATUS(float_exception_flags) |= float_flag_inexact;
4429        }
4430    }
4431    if ( aSign ) z = - z;
4432    return z;
4433
4434}
4435
4436/*----------------------------------------------------------------------------
4437| Returns the result of converting the quadruple-precision floating-point
4438| value `a' to the single-precision floating-point format.  The conversion
4439| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4440| Arithmetic.
4441*----------------------------------------------------------------------------*/
4442
4443float32 float128_to_float32( float128 a STATUS_PARAM )
4444{
4445    flag aSign;
4446    int32 aExp;
4447    bits64 aSig0, aSig1;
4448    bits32 zSig;
4449
4450    aSig1 = extractFloat128Frac1( a );
4451    aSig0 = extractFloat128Frac0( a );
4452    aExp = extractFloat128Exp( a );
4453    aSign = extractFloat128Sign( a );
4454    if ( aExp == 0x7FFF ) {
4455        if ( aSig0 | aSig1 ) {
4456            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4457        }
4458        return packFloat32( aSign, 0xFF, 0 );
4459    }
4460    aSig0 |= ( aSig1 != 0 );
4461    shift64RightJamming( aSig0, 18, &aSig0 );
4462    zSig = aSig0;
4463    if ( aExp || zSig ) {
4464        zSig |= 0x40000000;
4465        aExp -= 0x3F81;
4466    }
4467    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4468
4469}
4470
4471/*----------------------------------------------------------------------------
4472| Returns the result of converting the quadruple-precision floating-point
4473| value `a' to the double-precision floating-point format.  The conversion
4474| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4475| Arithmetic.
4476*----------------------------------------------------------------------------*/
4477
4478float64 float128_to_float64( float128 a STATUS_PARAM )
4479{
4480    flag aSign;
4481    int32 aExp;
4482    bits64 aSig0, aSig1;
4483
4484    aSig1 = extractFloat128Frac1( a );
4485    aSig0 = extractFloat128Frac0( a );
4486    aExp = extractFloat128Exp( a );
4487    aSign = extractFloat128Sign( a );
4488    if ( aExp == 0x7FFF ) {
4489        if ( aSig0 | aSig1 ) {
4490            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4491        }
4492        return packFloat64( aSign, 0x7FF, 0 );
4493    }
4494    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4495    aSig0 |= ( aSig1 != 0 );
4496    if ( aExp || aSig0 ) {
4497        aSig0 |= LIT64( 0x4000000000000000 );
4498        aExp -= 0x3C01;
4499    }
4500    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4501
4502}
4503
4504#ifdef FLOATX80
4505
4506/*----------------------------------------------------------------------------
4507| Returns the result of converting the quadruple-precision floating-point
4508| value `a' to the extended double-precision floating-point format.  The
4509| conversion is performed according to the IEC/IEEE Standard for Binary
4510| Floating-Point Arithmetic.
4511*----------------------------------------------------------------------------*/
4512
4513floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4514{
4515    flag aSign;
4516    int32 aExp;
4517    bits64 aSig0, aSig1;
4518
4519    aSig1 = extractFloat128Frac1( a );
4520    aSig0 = extractFloat128Frac0( a );
4521    aExp = extractFloat128Exp( a );
4522    aSign = extractFloat128Sign( a );
4523    if ( aExp == 0x7FFF ) {
4524        if ( aSig0 | aSig1 ) {
4525            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4526        }
4527        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4528    }
4529    if ( aExp == 0 ) {
4530        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4531        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4532    }
4533    else {
4534        aSig0 |= LIT64( 0x0001000000000000 );
4535    }
4536    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4537    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4538
4539}
4540
4541#endif
4542
4543/*----------------------------------------------------------------------------
4544| Rounds the quadruple-precision floating-point value `a' to an integer, and
4545| returns the result as a quadruple-precision floating-point value.  The
4546| operation is performed according to the IEC/IEEE Standard for Binary
4547| Floating-Point Arithmetic.
4548*----------------------------------------------------------------------------*/
4549
4550float128 float128_round_to_int( float128 a STATUS_PARAM )
4551{
4552    flag aSign;
4553    int32 aExp;
4554    bits64 lastBitMask, roundBitsMask;
4555    int8 roundingMode;
4556    float128 z;
4557
4558    aExp = extractFloat128Exp( a );
4559    if ( 0x402F <= aExp ) {
4560        if ( 0x406F <= aExp ) {
4561            if (    ( aExp == 0x7FFF )
4562                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4563               ) {
4564                return propagateFloat128NaN( a, a STATUS_VAR );
4565            }
4566            return a;
4567        }
4568        lastBitMask = 1;
4569        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4570        roundBitsMask = lastBitMask - 1;
4571        z = a;
4572        roundingMode = STATUS(float_rounding_mode);
4573        if ( roundingMode == float_round_nearest_even ) {
4574            if ( lastBitMask ) {
4575                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4576                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4577            }
4578            else {
4579                if ( (sbits64) z.low < 0 ) {
4580                    ++z.high;
4581                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4582                }
4583            }
4584        }
4585        else if ( roundingMode != float_round_to_zero ) {
4586            if (   extractFloat128Sign( z )
4587                 ^ ( roundingMode == float_round_up ) ) {
4588                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4589            }
4590        }
4591        z.low &= ~ roundBitsMask;
4592    }
4593    else {
4594        if ( aExp < 0x3FFF ) {
4595            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4596            STATUS(float_exception_flags) |= float_flag_inexact;
4597            aSign = extractFloat128Sign( a );
4598            switch ( STATUS(float_rounding_mode) ) {
4599             case float_round_nearest_even:
4600                if (    ( aExp == 0x3FFE )
4601                     && (   extractFloat128Frac0( a )
4602                          | extractFloat128Frac1( a ) )
4603                   ) {
4604                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4605                }
4606                break;
4607             case float_round_down:
4608                return
4609                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4610                    : packFloat128( 0, 0, 0, 0 );
4611             case float_round_up:
4612                return
4613                      aSign ? packFloat128( 1, 0, 0, 0 )
4614                    : packFloat128( 0, 0x3FFF, 0, 0 );
4615            }
4616            return packFloat128( aSign, 0, 0, 0 );
4617        }
4618        lastBitMask = 1;
4619        lastBitMask <<= 0x402F - aExp;
4620        roundBitsMask = lastBitMask - 1;
4621        z.low = 0;
4622        z.high = a.high;
4623        roundingMode = STATUS(float_rounding_mode);
4624        if ( roundingMode == float_round_nearest_even ) {
4625            z.high += lastBitMask>>1;
4626            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4627                z.high &= ~ lastBitMask;
4628            }
4629        }
4630        else if ( roundingMode != float_round_to_zero ) {
4631            if (   extractFloat128Sign( z )
4632                 ^ ( roundingMode == float_round_up ) ) {
4633                z.high |= ( a.low != 0 );
4634                z.high += roundBitsMask;
4635            }
4636        }
4637        z.high &= ~ roundBitsMask;
4638    }
4639    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4640        STATUS(float_exception_flags) |= float_flag_inexact;
4641    }
4642    return z;
4643
4644}
4645
4646/*----------------------------------------------------------------------------
4647| Returns the result of adding the absolute values of the quadruple-precision
4648| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4649| before being returned.  `zSign' is ignored if the result is a NaN.
4650| The addition is performed according to the IEC/IEEE Standard for Binary
4651| Floating-Point Arithmetic.
4652*----------------------------------------------------------------------------*/
4653
4654static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4655{
4656    int32 aExp, bExp, zExp;
4657    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4658    int32 expDiff;
4659
4660    aSig1 = extractFloat128Frac1( a );
4661    aSig0 = extractFloat128Frac0( a );
4662    aExp = extractFloat128Exp( a );
4663    bSig1 = extractFloat128Frac1( b );
4664    bSig0 = extractFloat128Frac0( b );
4665    bExp = extractFloat128Exp( b );
4666    expDiff = aExp - bExp;
4667    if ( 0 < expDiff ) {
4668        if ( aExp == 0x7FFF ) {
4669            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4670            return a;
4671        }
4672        if ( bExp == 0 ) {
4673            --expDiff;
4674        }
4675        else {
4676            bSig0 |= LIT64( 0x0001000000000000 );
4677        }
4678        shift128ExtraRightJamming(
4679            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4680        zExp = aExp;
4681    }
4682    else if ( expDiff < 0 ) {
4683        if ( bExp == 0x7FFF ) {
4684            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4685            return packFloat128( zSign, 0x7FFF, 0, 0 );
4686        }
4687        if ( aExp == 0 ) {
4688            ++expDiff;
4689        }
4690        else {
4691            aSig0 |= LIT64( 0x0001000000000000 );
4692        }
4693        shift128ExtraRightJamming(
4694            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4695        zExp = bExp;
4696    }
4697    else {
4698        if ( aExp == 0x7FFF ) {
4699            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4700                return propagateFloat128NaN( a, b STATUS_VAR );
4701            }
4702            return a;
4703        }
4704        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4705        if ( aExp == 0 ) {
4706            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4707            return packFloat128( zSign, 0, zSig0, zSig1 );
4708        }
4709        zSig2 = 0;
4710        zSig0 |= LIT64( 0x0002000000000000 );
4711        zExp = aExp;
4712        goto shiftRight1;
4713    }
4714    aSig0 |= LIT64( 0x0001000000000000 );
4715    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4716    --zExp;
4717    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4718    ++zExp;
4719 shiftRight1:
4720    shift128ExtraRightJamming(
4721        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4722 roundAndPack:
4723    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4724
4725}
4726
4727/*----------------------------------------------------------------------------
4728| Returns the result of subtracting the absolute values of the quadruple-
4729| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4730| difference is negated before being returned.  `zSign' is ignored if the
4731| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4732| Standard for Binary Floating-Point Arithmetic.
4733*----------------------------------------------------------------------------*/
4734
4735static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4736{
4737    int32 aExp, bExp, zExp;
4738    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4739    int32 expDiff;
4740    float128 z;
4741
4742    aSig1 = extractFloat128Frac1( a );
4743    aSig0 = extractFloat128Frac0( a );
4744    aExp = extractFloat128Exp( a );
4745    bSig1 = extractFloat128Frac1( b );
4746    bSig0 = extractFloat128Frac0( b );
4747    bExp = extractFloat128Exp( b );
4748    expDiff = aExp - bExp;
4749    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4750    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4751    if ( 0 < expDiff ) goto aExpBigger;
4752    if ( expDiff < 0 ) goto bExpBigger;
4753    if ( aExp == 0x7FFF ) {
4754        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4755            return propagateFloat128NaN( a, b STATUS_VAR );
4756        }
4757        float_raise( float_flag_invalid STATUS_VAR);
4758        z.low = float128_default_nan_low;
4759        z.high = float128_default_nan_high;
4760        return z;
4761    }
4762    if ( aExp == 0 ) {
4763        aExp = 1;
4764        bExp = 1;
4765    }
4766    if ( bSig0 < aSig0 ) goto aBigger;
4767    if ( aSig0 < bSig0 ) goto bBigger;
4768    if ( bSig1 < aSig1 ) goto aBigger;
4769    if ( aSig1 < bSig1 ) goto bBigger;
4770    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4771 bExpBigger:
4772    if ( bExp == 0x7FFF ) {
4773        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4774        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4775    }
4776    if ( aExp == 0 ) {
4777        ++expDiff;
4778    }
4779    else {
4780        aSig0 |= LIT64( 0x4000000000000000 );
4781    }
4782    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4783    bSig0 |= LIT64( 0x4000000000000000 );
4784 bBigger:
4785    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4786    zExp = bExp;
4787    zSign ^= 1;
4788    goto normalizeRoundAndPack;
4789 aExpBigger:
4790    if ( aExp == 0x7FFF ) {
4791        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4792        return a;
4793    }
4794    if ( bExp == 0 ) {
4795        --expDiff;
4796    }
4797    else {
4798        bSig0 |= LIT64( 0x4000000000000000 );
4799    }
4800    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4801    aSig0 |= LIT64( 0x4000000000000000 );
4802 aBigger:
4803    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4804    zExp = aExp;
4805 normalizeRoundAndPack:
4806    --zExp;
4807    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4808
4809}
4810
4811/*----------------------------------------------------------------------------
4812| Returns the result of adding the quadruple-precision floating-point values
4813| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4814| for Binary Floating-Point Arithmetic.
4815*----------------------------------------------------------------------------*/
4816
4817float128 float128_add( float128 a, float128 b STATUS_PARAM )
4818{
4819    flag aSign, bSign;
4820
4821    aSign = extractFloat128Sign( a );
4822    bSign = extractFloat128Sign( b );
4823    if ( aSign == bSign ) {
4824        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4825    }
4826    else {
4827        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4828    }
4829
4830}
4831
4832/*----------------------------------------------------------------------------
4833| Returns the result of subtracting the quadruple-precision floating-point
4834| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4835| Standard for Binary Floating-Point Arithmetic.
4836*----------------------------------------------------------------------------*/
4837
4838float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4839{
4840    flag aSign, bSign;
4841
4842    aSign = extractFloat128Sign( a );
4843    bSign = extractFloat128Sign( b );
4844    if ( aSign == bSign ) {
4845        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4846    }
4847    else {
4848        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4849    }
4850
4851}
4852
4853/*----------------------------------------------------------------------------
4854| Returns the result of multiplying the quadruple-precision floating-point
4855| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4856| Standard for Binary Floating-Point Arithmetic.
4857*----------------------------------------------------------------------------*/
4858
4859float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4860{
4861    flag aSign, bSign, zSign;
4862    int32 aExp, bExp, zExp;
4863    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4864    float128 z;
4865
4866    aSig1 = extractFloat128Frac1( a );
4867    aSig0 = extractFloat128Frac0( a );
4868    aExp = extractFloat128Exp( a );
4869    aSign = extractFloat128Sign( a );
4870    bSig1 = extractFloat128Frac1( b );
4871    bSig0 = extractFloat128Frac0( b );
4872    bExp = extractFloat128Exp( b );
4873    bSign = extractFloat128Sign( b );
4874    zSign = aSign ^ bSign;
4875    if ( aExp == 0x7FFF ) {
4876        if (    ( aSig0 | aSig1 )
4877             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4878            return propagateFloat128NaN( a, b STATUS_VAR );
4879        }
4880        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4881        return packFloat128( zSign, 0x7FFF, 0, 0 );
4882    }
4883    if ( bExp == 0x7FFF ) {
4884        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4885        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4886 invalid:
4887            float_raise( float_flag_invalid STATUS_VAR);
4888            z.low = float128_default_nan_low;
4889            z.high = float128_default_nan_high;
4890            return z;
4891        }
4892        return packFloat128( zSign, 0x7FFF, 0, 0 );
4893    }
4894    if ( aExp == 0 ) {
4895        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4896        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4897    }
4898    if ( bExp == 0 ) {
4899        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4900        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4901    }
4902    zExp = aExp + bExp - 0x4000;
4903    aSig0 |= LIT64( 0x0001000000000000 );
4904    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4905    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4906    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4907    zSig2 |= ( zSig3 != 0 );
4908    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4909        shift128ExtraRightJamming(
4910            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4911        ++zExp;
4912    }
4913    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4914
4915}
4916
4917/*----------------------------------------------------------------------------
4918| Returns the result of dividing the quadruple-precision floating-point value
4919| `a' by the corresponding value `b'.  The operation is performed according to
4920| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4921*----------------------------------------------------------------------------*/
4922
4923float128 float128_div( float128 a, float128 b STATUS_PARAM )
4924{
4925    flag aSign, bSign, zSign;
4926    int32 aExp, bExp, zExp;
4927    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4928    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4929    float128 z;
4930
4931    aSig1 = extractFloat128Frac1( a );
4932    aSig0 = extractFloat128Frac0( a );
4933    aExp = extractFloat128Exp( a );
4934    aSign = extractFloat128Sign( a );
4935    bSig1 = extractFloat128Frac1( b );
4936    bSig0 = extractFloat128Frac0( b );
4937    bExp = extractFloat128Exp( b );
4938    bSign = extractFloat128Sign( b );
4939    zSign = aSign ^ bSign;
4940    if ( aExp == 0x7FFF ) {
4941        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4942        if ( bExp == 0x7FFF ) {
4943            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4944            goto invalid;
4945        }
4946        return packFloat128( zSign, 0x7FFF, 0, 0 );
4947    }
4948    if ( bExp == 0x7FFF ) {
4949        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4950        return packFloat128( zSign, 0, 0, 0 );
4951    }
4952    if ( bExp == 0 ) {
4953        if ( ( bSig0 | bSig1 ) == 0 ) {
4954            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4955 invalid:
4956                float_raise( float_flag_invalid STATUS_VAR);
4957                z.low = float128_default_nan_low;
4958                z.high = float128_default_nan_high;
4959                return z;
4960            }
4961            float_raise( float_flag_divbyzero STATUS_VAR);
4962            return packFloat128( zSign, 0x7FFF, 0, 0 );
4963        }
4964        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4965    }
4966    if ( aExp == 0 ) {
4967        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4968        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4969    }
4970    zExp = aExp - bExp + 0x3FFD;
4971    shortShift128Left(
4972        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4973    shortShift128Left(
4974        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4975    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4976        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4977        ++zExp;
4978    }
4979    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4980    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4981    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4982    while ( (sbits64) rem0 < 0 ) {
4983        --zSig0;
4984        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4985    }
4986    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4987    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4988        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4989        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4990        while ( (sbits64) rem1 < 0 ) {
4991            --zSig1;
4992            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4993        }
4994        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4995    }
4996    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4997    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4998
4999}
5000
5001/*----------------------------------------------------------------------------
5002| Returns the remainder of the quadruple-precision floating-point value `a'
5003| with respect to the corresponding value `b'.  The operation is performed
5004| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005*----------------------------------------------------------------------------*/
5006
5007float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5008{
5009    flag aSign, bSign, zSign;
5010    int32 aExp, bExp, expDiff;
5011    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5012    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5013    sbits64 sigMean0;
5014    float128 z;
5015
5016    aSig1 = extractFloat128Frac1( a );
5017    aSig0 = extractFloat128Frac0( a );
5018    aExp = extractFloat128Exp( a );
5019    aSign = extractFloat128Sign( a );
5020    bSig1 = extractFloat128Frac1( b );
5021    bSig0 = extractFloat128Frac0( b );
5022    bExp = extractFloat128Exp( b );
5023    bSign = extractFloat128Sign( b );
5024    if ( aExp == 0x7FFF ) {
5025        if (    ( aSig0 | aSig1 )
5026             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5027            return propagateFloat128NaN( a, b STATUS_VAR );
5028        }
5029        goto invalid;
5030    }
5031    if ( bExp == 0x7FFF ) {
5032        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5033        return a;
5034    }
5035    if ( bExp == 0 ) {
5036        if ( ( bSig0 | bSig1 ) == 0 ) {
5037 invalid:
5038            float_raise( float_flag_invalid STATUS_VAR);
5039            z.low = float128_default_nan_low;
5040            z.high = float128_default_nan_high;
5041            return z;
5042        }
5043        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5044    }
5045    if ( aExp == 0 ) {
5046        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5047        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5048    }
5049    expDiff = aExp - bExp;
5050    if ( expDiff < -1 ) return a;
5051    shortShift128Left(
5052        aSig0 | LIT64( 0x0001000000000000 ),
5053        aSig1,
5054        15 - ( expDiff < 0 ),
5055        &aSig0,
5056        &aSig1
5057    );
5058    shortShift128Left(
5059        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5060    q = le128( bSig0, bSig1, aSig0, aSig1 );
5061    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5062    expDiff -= 64;
5063    while ( 0 < expDiff ) {
5064        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5065        q = ( 4 < q ) ? q - 4 : 0;
5066        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5067        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5068        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5069        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5070        expDiff -= 61;
5071    }
5072    if ( -64 < expDiff ) {
5073        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5074        q = ( 4 < q ) ? q - 4 : 0;
5075        q >>= - expDiff;
5076        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5077        expDiff += 52;
5078        if ( expDiff < 0 ) {
5079            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5080        }
5081        else {
5082            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5083        }
5084        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5085        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5086    }
5087    else {
5088        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5089        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5090    }
5091    do {
5092        alternateASig0 = aSig0;
5093        alternateASig1 = aSig1;
5094        ++q;
5095        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5096    } while ( 0 <= (sbits64) aSig0 );
5097    add128(
5098        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5099    if (    ( sigMean0 < 0 )
5100         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5101        aSig0 = alternateASig0;
5102        aSig1 = alternateASig1;
5103    }
5104    zSign = ( (sbits64) aSig0 < 0 );
5105    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5106    return
5107        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5108
5109}
5110
5111/*----------------------------------------------------------------------------
5112| Returns the square root of the quadruple-precision floating-point value `a'.
5113| The operation is performed according to the IEC/IEEE Standard for Binary
5114| Floating-Point Arithmetic.
5115*----------------------------------------------------------------------------*/
5116
5117float128 float128_sqrt( float128 a STATUS_PARAM )
5118{
5119    flag aSign;
5120    int32 aExp, zExp;
5121    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5122    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5123    float128 z;
5124
5125    aSig1 = extractFloat128Frac1( a );
5126    aSig0 = extractFloat128Frac0( a );
5127    aExp = extractFloat128Exp( a );
5128    aSign = extractFloat128Sign( a );
5129    if ( aExp == 0x7FFF ) {
5130        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5131        if ( ! aSign ) return a;
5132        goto invalid;
5133    }
5134    if ( aSign ) {
5135        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5136 invalid:
5137        float_raise( float_flag_invalid STATUS_VAR);
5138        z.low = float128_default_nan_low;
5139        z.high = float128_default_nan_high;
5140        return z;
5141    }
5142    if ( aExp == 0 ) {
5143        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5144        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5145    }
5146    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5147    aSig0 |= LIT64( 0x0001000000000000 );
5148    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5149    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5150    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5151    doubleZSig0 = zSig0<<1;
5152    mul64To128( zSig0, zSig0, &term0, &term1 );
5153    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5154    while ( (sbits64) rem0 < 0 ) {
5155        --zSig0;
5156        doubleZSig0 -= 2;
5157        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5158    }
5159    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5160    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5161        if ( zSig1 == 0 ) zSig1 = 1;
5162        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5163        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5164        mul64To128( zSig1, zSig1, &term2, &term3 );
5165        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5166        while ( (sbits64) rem1 < 0 ) {
5167            --zSig1;
5168            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5169            term3 |= 1;
5170            term2 |= doubleZSig0;
5171            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5172        }
5173        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5174    }
5175    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5176    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5177
5178}
5179
5180/*----------------------------------------------------------------------------
5181| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5182| the corresponding value `b', and 0 otherwise.  The comparison is performed
5183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184*----------------------------------------------------------------------------*/
5185
5186int float128_eq( float128 a, float128 b STATUS_PARAM )
5187{
5188
5189    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5190              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5191         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5192              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5193       ) {
5194        if (    float128_is_signaling_nan( a )
5195             || float128_is_signaling_nan( b ) ) {
5196            float_raise( float_flag_invalid STATUS_VAR);
5197        }
5198        return 0;
5199    }
5200    return
5201           ( a.low == b.low )
5202        && (    ( a.high == b.high )
5203             || (    ( a.low == 0 )
5204                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5205           );
5206
5207}
5208
5209/*----------------------------------------------------------------------------
5210| Returns 1 if the quadruple-precision floating-point value `a' is less than
5211| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5212| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5213| Arithmetic.
5214*----------------------------------------------------------------------------*/
5215
5216int float128_le( float128 a, float128 b STATUS_PARAM )
5217{
5218    flag aSign, bSign;
5219
5220    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5221              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5222         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5223              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5224       ) {
5225        float_raise( float_flag_invalid STATUS_VAR);
5226        return 0;
5227    }
5228    aSign = extractFloat128Sign( a );
5229    bSign = extractFloat128Sign( b );
5230    if ( aSign != bSign ) {
5231        return
5232               aSign
5233            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5234                 == 0 );
5235    }
5236    return
5237          aSign ? le128( b.high, b.low, a.high, a.low )
5238        : le128( a.high, a.low, b.high, b.low );
5239
5240}
5241
5242/*----------------------------------------------------------------------------
5243| Returns 1 if the quadruple-precision floating-point value `a' is less than
5244| the corresponding value `b', and 0 otherwise.  The comparison is performed
5245| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5246*----------------------------------------------------------------------------*/
5247
5248int float128_lt( float128 a, float128 b STATUS_PARAM )
5249{
5250    flag aSign, bSign;
5251
5252    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5253              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5254         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5255              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5256       ) {
5257        float_raise( float_flag_invalid STATUS_VAR);
5258        return 0;
5259    }
5260    aSign = extractFloat128Sign( a );
5261    bSign = extractFloat128Sign( b );
5262    if ( aSign != bSign ) {
5263        return
5264               aSign
5265            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5266                 != 0 );
5267    }
5268    return
5269          aSign ? lt128( b.high, b.low, a.high, a.low )
5270        : lt128( a.high, a.low, b.high, b.low );
5271
5272}
5273
5274/*----------------------------------------------------------------------------
5275| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5276| the corresponding value `b', and 0 otherwise.  The invalid exception is
5277| raised if either operand is a NaN.  Otherwise, the comparison is performed
5278| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5279*----------------------------------------------------------------------------*/
5280
5281int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5282{
5283
5284    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5285              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5286         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5287              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5288       ) {
5289        float_raise( float_flag_invalid STATUS_VAR);
5290        return 0;
5291    }
5292    return
5293           ( a.low == b.low )
5294        && (    ( a.high == b.high )
5295             || (    ( a.low == 0 )
5296                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5297           );
5298
5299}
5300
5301/*----------------------------------------------------------------------------
5302| Returns 1 if the quadruple-precision floating-point value `a' is less than
5303| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5304| cause an exception.  Otherwise, the comparison is performed according to the
5305| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5306*----------------------------------------------------------------------------*/
5307
5308int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5309{
5310    flag aSign, bSign;
5311
5312    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5313              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5314         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5315              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5316       ) {
5317        if (    float128_is_signaling_nan( a )
5318             || float128_is_signaling_nan( b ) ) {
5319            float_raise( float_flag_invalid STATUS_VAR);
5320        }
5321        return 0;
5322    }
5323    aSign = extractFloat128Sign( a );
5324    bSign = extractFloat128Sign( b );
5325    if ( aSign != bSign ) {
5326        return
5327               aSign
5328            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5329                 == 0 );
5330    }
5331    return
5332          aSign ? le128( b.high, b.low, a.high, a.low )
5333        : le128( a.high, a.low, b.high, b.low );
5334
5335}
5336
5337/*----------------------------------------------------------------------------
5338| Returns 1 if the quadruple-precision floating-point value `a' is less than
5339| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5340| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5341| Standard for Binary Floating-Point Arithmetic.
5342*----------------------------------------------------------------------------*/
5343
5344int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5345{
5346    flag aSign, bSign;
5347
5348    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5349              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5350         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5351              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5352       ) {
5353        if (    float128_is_signaling_nan( a )
5354             || float128_is_signaling_nan( b ) ) {
5355            float_raise( float_flag_invalid STATUS_VAR);
5356        }
5357        return 0;
5358    }
5359    aSign = extractFloat128Sign( a );
5360    bSign = extractFloat128Sign( b );
5361    if ( aSign != bSign ) {
5362        return
5363               aSign
5364            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5365                 != 0 );
5366    }
5367    return
5368          aSign ? lt128( b.high, b.low, a.high, a.low )
5369        : lt128( a.high, a.low, b.high, b.low );
5370
5371}
5372
5373#endif
5374
5375/* misc functions */
5376float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5377{
5378    return int64_to_float32(a STATUS_VAR);
5379}
5380
5381float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5382{
5383    return int64_to_float64(a STATUS_VAR);
5384}
5385
5386unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5387{
5388    int64_t v;
5389    unsigned int res;
5390
5391    v = float32_to_int64(a STATUS_VAR);
5392    if (v < 0) {
5393        res = 0;
5394        float_raise( float_flag_invalid STATUS_VAR);
5395    } else if (v > 0xffffffff) {
5396        res = 0xffffffff;
5397        float_raise( float_flag_invalid STATUS_VAR);
5398    } else {
5399        res = v;
5400    }
5401    return res;
5402}
5403
5404unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5405{
5406    int64_t v;
5407    unsigned int res;
5408
5409    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5410    if (v < 0) {
5411        res = 0;
5412        float_raise( float_flag_invalid STATUS_VAR);
5413    } else if (v > 0xffffffff) {
5414        res = 0xffffffff;
5415        float_raise( float_flag_invalid STATUS_VAR);
5416    } else {
5417        res = v;
5418    }
5419    return res;
5420}
5421
5422unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5423{
5424    int64_t v;
5425    unsigned int res;
5426
5427    v = float64_to_int64(a STATUS_VAR);
5428    if (v < 0) {
5429        res = 0;
5430        float_raise( float_flag_invalid STATUS_VAR);
5431    } else if (v > 0xffffffff) {
5432        res = 0xffffffff;
5433        float_raise( float_flag_invalid STATUS_VAR);
5434    } else {
5435        res = v;
5436    }
5437    return res;
5438}
5439
5440unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5441{
5442    int64_t v;
5443    unsigned int res;
5444
5445    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5446    if (v < 0) {
5447        res = 0;
5448        float_raise( float_flag_invalid STATUS_VAR);
5449    } else if (v > 0xffffffff) {
5450        res = 0xffffffff;
5451        float_raise( float_flag_invalid STATUS_VAR);
5452    } else {
5453        res = v;
5454    }
5455    return res;
5456}
5457
5458/* FIXME: This looks broken.  */
5459uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5460{
5461    int64_t v;
5462
5463    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5464    v += float64_val(a);
5465    v = float64_to_int64(make_float64(v) STATUS_VAR);
5466
5467    return v - INT64_MIN;
5468}
5469
5470uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5471{
5472    int64_t v;
5473
5474    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5475    v += float64_val(a);
5476    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5477
5478    return v - INT64_MIN;
5479}
5480
5481#define COMPARE(s, nan_exp)                                                  \
5482INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5483                                      int is_quiet STATUS_PARAM )            \
5484{                                                                            \
5485    flag aSign, bSign;                                                       \
5486    bits ## s av, bv;                                                        \
5487                                                                             \
5488    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5489         extractFloat ## s ## Frac( a ) ) ||                                 \
5490        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5491          extractFloat ## s ## Frac( b ) )) {                                \
5492        if (!is_quiet ||                                                     \
5493            float ## s ## _is_signaling_nan( a ) ||                          \
5494            float ## s ## _is_signaling_nan( b ) ) {                         \
5495            float_raise( float_flag_invalid STATUS_VAR);                     \
5496        }                                                                    \
5497        return float_relation_unordered;                                     \
5498    }                                                                        \
5499    aSign = extractFloat ## s ## Sign( a );                                  \
5500    bSign = extractFloat ## s ## Sign( b );                                  \
5501    av = float ## s ## _val(a);                                              \
5502    bv = float ## s ## _val(b);                                              \
5503    if ( aSign != bSign ) {                                                  \
5504        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5505            /* zero case */                                                  \
5506            return float_relation_equal;                                     \
5507        } else {                                                             \
5508            return 1 - (2 * aSign);                                          \
5509        }                                                                    \
5510    } else {                                                                 \
5511        if (av == bv) {                                                      \
5512            return float_relation_equal;                                     \
5513        } else {                                                             \
5514            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5515        }                                                                    \
5516    }                                                                        \
5517}                                                                            \
5518                                                                             \
5519int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5520{                                                                            \
5521    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5522}                                                                            \
5523                                                                             \
5524int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5525{                                                                            \
5526    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5527}
5528
5529COMPARE(32, 0xff)
5530COMPARE(64, 0x7ff)
5531
5532INLINE int float128_compare_internal( float128 a, float128 b,
5533                                      int is_quiet STATUS_PARAM )
5534{
5535    flag aSign, bSign;
5536
5537    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5538          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5539        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5540          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5541        if (!is_quiet ||
5542            float128_is_signaling_nan( a ) ||
5543            float128_is_signaling_nan( b ) ) {
5544            float_raise( float_flag_invalid STATUS_VAR);
5545        }
5546        return float_relation_unordered;
5547    }
5548    aSign = extractFloat128Sign( a );
5549    bSign = extractFloat128Sign( b );
5550    if ( aSign != bSign ) {
5551        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5552            /* zero case */
5553            return float_relation_equal;
5554        } else {
5555            return 1 - (2 * aSign);
5556        }
5557    } else {
5558        if (a.low == b.low && a.high == b.high) {
5559            return float_relation_equal;
5560        } else {
5561            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5562        }
5563    }
5564}
5565
5566int float128_compare( float128 a, float128 b STATUS_PARAM )
5567{
5568    return float128_compare_internal(a, b, 0 STATUS_VAR);
5569}
5570
5571int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5572{
5573    return float128_compare_internal(a, b, 1 STATUS_VAR);
5574}
5575
5576/* Multiply A by 2 raised to the power N.  */
5577float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5578{
5579    flag aSign;
5580    int16 aExp;
5581    bits32 aSig;
5582
5583    aSig = extractFloat32Frac( a );
5584    aExp = extractFloat32Exp( a );
5585    aSign = extractFloat32Sign( a );
5586
5587    if ( aExp == 0xFF ) {
5588        return a;
5589    }
5590    if ( aExp != 0 )
5591        aSig |= 0x00800000;
5592    else if ( aSig == 0 )
5593        return a;
5594
5595    aExp += n - 1;
5596    aSig <<= 7;
5597    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5598}
5599
5600float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5601{
5602    flag aSign;
5603    int16 aExp;
5604    bits64 aSig;
5605
5606    aSig = extractFloat64Frac( a );
5607    aExp = extractFloat64Exp( a );
5608    aSign = extractFloat64Sign( a );
5609
5610    if ( aExp == 0x7FF ) {
5611        return a;
5612    }
5613    if ( aExp != 0 )
5614        aSig |= LIT64( 0x0010000000000000 );
5615    else if ( aSig == 0 )
5616        return a;
5617
5618    aExp += n - 1;
5619    aSig <<= 10;
5620    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5621}
5622
5623#ifdef FLOATX80
5624floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5625{
5626    flag aSign;
5627    int16 aExp;
5628    bits64 aSig;
5629
5630    aSig = extractFloatx80Frac( a );
5631    aExp = extractFloatx80Exp( a );
5632    aSign = extractFloatx80Sign( a );
5633
5634    if ( aExp == 0x7FF ) {
5635        return a;
5636    }
5637    if (aExp == 0 && aSig == 0)
5638        return a;
5639
5640    aExp += n;
5641    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5642                                          aSign, aExp, aSig, 0 STATUS_VAR );
5643}
5644#endif
5645
5646#ifdef FLOAT128
5647float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5648{
5649    flag aSign;
5650    int32 aExp;
5651    bits64 aSig0, aSig1;
5652
5653    aSig1 = extractFloat128Frac1( a );
5654    aSig0 = extractFloat128Frac0( a );
5655    aExp = extractFloat128Exp( a );
5656    aSign = extractFloat128Sign( a );
5657    if ( aExp == 0x7FFF ) {
5658        return a;
5659    }
5660    if ( aExp != 0 )
5661        aSig0 |= LIT64( 0x0001000000000000 );
5662    else if ( aSig0 == 0 && aSig1 == 0 )
5663        return a;
5664
5665    aExp += n - 1;
5666    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5667                                          STATUS_VAR );
5668
5669}
5670#endif
5671