linux/arch/arm/nwfpe/softfloat.c
<<
>>
Prefs
   1/*
   2===============================================================================
   3
   4This C source file is part of the SoftFloat IEC/IEEE Floating-point
   5Arithmetic Package, Release 2.
   6
   7Written by John R. Hauser.  This work was made possible in part by the
   8International Computer Science Institute, located at Suite 600, 1947 Center
   9Street, Berkeley, California 94704.  Funding was partially provided by the
  10National Science Foundation under grant MIP-9311980.  The original version
  11of this code was written as part of a project to build a fixed-point vector
  12processor in collaboration with the University of California at Berkeley,
  13overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14is available through the web page
  15http://www.jhauser.us/arithmetic/SoftFloat-2b/SoftFloat-source.txt
  16
  17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22
  23Derivative works are acceptable, even for commercial purposes, so long as
  24(1) they include prominent notice that the work is derivative, and (2) they
  25include prominent notice akin to these three paragraphs for those parts of
  26this code that are retained.
  27
  28===============================================================================
  29*/
  30
  31#include <asm/div64.h>
  32
  33#include "fpa11.h"
  34//#include "milieu.h"
  35//#include "softfloat.h"
  36
  37/*
  38-------------------------------------------------------------------------------
  39Primitive arithmetic functions, including multi-word arithmetic, and
  40division and square root approximations.  (Can be specialized to target if
  41desired.)
  42-------------------------------------------------------------------------------
  43*/
  44#include "softfloat-macros"
  45
  46/*
  47-------------------------------------------------------------------------------
  48Functions and definitions to determine:  (1) whether tininess for underflow
  49is detected before or after rounding by default, (2) what (if anything)
  50happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52are propagated from function inputs to output.  These details are target-
  53specific.
  54-------------------------------------------------------------------------------
  55*/
  56#include "softfloat-specialize"
  57
  58/*
  59-------------------------------------------------------------------------------
  60Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  61and 7, and returns the properly rounded 32-bit integer corresponding to the
  62input.  If `zSign' is nonzero, the input is negated before being converted
  63to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
  64input is simply rounded to an integer, with the inexact exception raised if
  65the input cannot be represented exactly as an integer.  If the fixed-point
  66input is too large, however, the invalid exception is raised and the largest
  67positive or negative integer is returned.
  68-------------------------------------------------------------------------------
  69*/
  70static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
  71{
  72    int8 roundingMode;
  73    flag roundNearestEven;
  74    int8 roundIncrement, roundBits;
  75    int32 z;
  76
  77    roundingMode = roundData->mode;
  78    roundNearestEven = ( roundingMode == float_round_nearest_even );
  79    roundIncrement = 0x40;
  80    if ( ! roundNearestEven ) {
  81        if ( roundingMode == float_round_to_zero ) {
  82            roundIncrement = 0;
  83        }
  84        else {
  85            roundIncrement = 0x7F;
  86            if ( zSign ) {
  87                if ( roundingMode == float_round_up ) roundIncrement = 0;
  88            }
  89            else {
  90                if ( roundingMode == float_round_down ) roundIncrement = 0;
  91            }
  92        }
  93    }
  94    roundBits = absZ & 0x7F;
  95    absZ = ( absZ + roundIncrement )>>7;
  96    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  97    z = absZ;
  98    if ( zSign ) z = - z;
  99    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 100        roundData->exception |= float_flag_invalid;
 101        return zSign ? 0x80000000 : 0x7FFFFFFF;
 102    }
 103    if ( roundBits ) roundData->exception |= float_flag_inexact;
 104    return z;
 105
 106}
 107
 108/*
 109-------------------------------------------------------------------------------
 110Returns the fraction bits of the single-precision floating-point value `a'.
 111-------------------------------------------------------------------------------
 112*/
 113INLINE bits32 extractFloat32Frac( float32 a )
 114{
 115
 116    return a & 0x007FFFFF;
 117
 118}
 119
 120/*
 121-------------------------------------------------------------------------------
 122Returns the exponent bits of the single-precision floating-point value `a'.
 123-------------------------------------------------------------------------------
 124*/
 125INLINE int16 extractFloat32Exp( float32 a )
 126{
 127
 128    return ( a>>23 ) & 0xFF;
 129
 130}
 131
 132/*
 133-------------------------------------------------------------------------------
 134Returns the sign bit of the single-precision floating-point value `a'.
 135-------------------------------------------------------------------------------
 136*/
 137#if 0   /* in softfloat.h */
 138INLINE flag extractFloat32Sign( float32 a )
 139{
 140
 141    return a>>31;
 142
 143}
 144#endif
 145
 146/*
 147-------------------------------------------------------------------------------
 148Normalizes the subnormal single-precision floating-point value represented
 149by the denormalized significand `aSig'.  The normalized exponent and
 150significand are stored at the locations pointed to by `zExpPtr' and
 151`zSigPtr', respectively.
 152-------------------------------------------------------------------------------
 153*/
 154static void
 155 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 156{
 157    int8 shiftCount;
 158
 159    shiftCount = countLeadingZeros32( aSig ) - 8;
 160    *zSigPtr = aSig<<shiftCount;
 161    *zExpPtr = 1 - shiftCount;
 162
 163}
 164
 165/*
 166-------------------------------------------------------------------------------
 167Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 168single-precision floating-point value, returning the result.  After being
 169shifted into the proper positions, the three fields are simply added
 170together to form the result.  This means that any integer portion of `zSig'
 171will be added into the exponent.  Since a properly normalized significand
 172will have an integer portion equal to 1, the `zExp' input should be 1 less
 173than the desired result exponent whenever `zSig' is a complete, normalized
 174significand.
 175-------------------------------------------------------------------------------
 176*/
 177INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 178{
 179#if 0
 180   float32 f;
 181   __asm__("@ packFloat32                               \n\
 182            mov %0, %1, asl #31                         \n\
 183            orr %0, %2, asl #23                         \n\
 184            orr %0, %3"
 185            : /* no outputs */
 186            : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
 187            : "cc");
 188   return f;
 189#else
 190    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 191#endif 
 192}
 193
 194/*
 195-------------------------------------------------------------------------------
 196Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 197and significand `zSig', and returns the proper single-precision floating-
 198point value corresponding to the abstract input.  Ordinarily, the abstract
 199value is simply rounded and packed into the single-precision format, with
 200the inexact exception raised if the abstract input cannot be represented
 201exactly.  If the abstract value is too large, however, the overflow and
 202inexact exceptions are raised and an infinity or maximal finite value is
 203returned.  If the abstract value is too small, the input value is rounded to
 204a subnormal number, and the underflow and inexact exceptions are raised if
 205the abstract input cannot be represented exactly as a subnormal single-
 206precision floating-point number.
 207    The input significand `zSig' has its binary point between bits 30
 208and 29, which is 7 bits to the left of the usual location.  This shifted
 209significand must be normalized or smaller.  If `zSig' is not normalized,
 210`zExp' must be 0; in that case, the result returned is a subnormal number,
 211and it must not require rounding.  In the usual case that `zSig' is
 212normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 213The handling of underflow and overflow follows the IEC/IEEE Standard for
 214Binary Floating-point Arithmetic.
 215-------------------------------------------------------------------------------
 216*/
 217static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
 218{
 219    int8 roundingMode;
 220    flag roundNearestEven;
 221    int8 roundIncrement, roundBits;
 222    flag isTiny;
 223
 224    roundingMode = roundData->mode;
 225    roundNearestEven = ( roundingMode == float_round_nearest_even );
 226    roundIncrement = 0x40;
 227    if ( ! roundNearestEven ) {
 228        if ( roundingMode == float_round_to_zero ) {
 229            roundIncrement = 0;
 230        }
 231        else {
 232            roundIncrement = 0x7F;
 233            if ( zSign ) {
 234                if ( roundingMode == float_round_up ) roundIncrement = 0;
 235            }
 236            else {
 237                if ( roundingMode == float_round_down ) roundIncrement = 0;
 238            }
 239        }
 240    }
 241    roundBits = zSig & 0x7F;
 242    if ( 0xFD <= (bits16) zExp ) {
 243        if (    ( 0xFD < zExp )
 244             || (    ( zExp == 0xFD )
 245                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 246           ) {
 247            roundData->exception |= float_flag_overflow | float_flag_inexact;
 248            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 249        }
 250        if ( zExp < 0 ) {
 251            isTiny =
 252                   ( float_detect_tininess == float_tininess_before_rounding )
 253                || ( zExp < -1 )
 254                || ( zSig + roundIncrement < 0x80000000 );
 255            shift32RightJamming( zSig, - zExp, &zSig );
 256            zExp = 0;
 257            roundBits = zSig & 0x7F;
 258            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 259        }
 260    }
 261    if ( roundBits ) roundData->exception |= float_flag_inexact;
 262    zSig = ( zSig + roundIncrement )>>7;
 263    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 264    if ( zSig == 0 ) zExp = 0;
 265    return packFloat32( zSign, zExp, zSig );
 266
 267}
 268
 269/*
 270-------------------------------------------------------------------------------
 271Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 272and significand `zSig', and returns the proper single-precision floating-
 273point value corresponding to the abstract input.  This routine is just like
 274`roundAndPackFloat32' except that `zSig' does not have to be normalized in
 275any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 276point exponent.
 277-------------------------------------------------------------------------------
 278*/
 279static float32
 280 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
 281{
 282    int8 shiftCount;
 283
 284    shiftCount = countLeadingZeros32( zSig ) - 1;
 285    return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
 286
 287}
 288
 289/*
 290-------------------------------------------------------------------------------
 291Returns the fraction bits of the double-precision floating-point value `a'.
 292-------------------------------------------------------------------------------
 293*/
 294INLINE bits64 extractFloat64Frac( float64 a )
 295{
 296
 297    return a & LIT64( 0x000FFFFFFFFFFFFF );
 298
 299}
 300
 301/*
 302-------------------------------------------------------------------------------
 303Returns the exponent bits of the double-precision floating-point value `a'.
 304-------------------------------------------------------------------------------
 305*/
 306INLINE int16 extractFloat64Exp( float64 a )
 307{
 308
 309    return ( a>>52 ) & 0x7FF;
 310
 311}
 312
 313/*
 314-------------------------------------------------------------------------------
 315Returns the sign bit of the double-precision floating-point value `a'.
 316-------------------------------------------------------------------------------
 317*/
 318#if 0   /* in softfloat.h */
 319INLINE flag extractFloat64Sign( float64 a )
 320{
 321
 322    return a>>63;
 323
 324}
 325#endif
 326
 327/*
 328-------------------------------------------------------------------------------
 329Normalizes the subnormal double-precision floating-point value represented
 330by the denormalized significand `aSig'.  The normalized exponent and
 331significand are stored at the locations pointed to by `zExpPtr' and
 332`zSigPtr', respectively.
 333-------------------------------------------------------------------------------
 334*/
 335static void
 336 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 337{
 338    int8 shiftCount;
 339
 340    shiftCount = countLeadingZeros64( aSig ) - 11;
 341    *zSigPtr = aSig<<shiftCount;
 342    *zExpPtr = 1 - shiftCount;
 343
 344}
 345
 346/*
 347-------------------------------------------------------------------------------
 348Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 349double-precision floating-point value, returning the result.  After being
 350shifted into the proper positions, the three fields are simply added
 351together to form the result.  This means that any integer portion of `zSig'
 352will be added into the exponent.  Since a properly normalized significand
 353will have an integer portion equal to 1, the `zExp' input should be 1 less
 354than the desired result exponent whenever `zSig' is a complete, normalized
 355significand.
 356-------------------------------------------------------------------------------
 357*/
 358INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 359{
 360
 361    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 362
 363}
 364
 365/*
 366-------------------------------------------------------------------------------
 367Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 368and significand `zSig', and returns the proper double-precision floating-
 369point value corresponding to the abstract input.  Ordinarily, the abstract
 370value is simply rounded and packed into the double-precision format, with
 371the inexact exception raised if the abstract input cannot be represented
 372exactly.  If the abstract value is too large, however, the overflow and
 373inexact exceptions are raised and an infinity or maximal finite value is
 374returned.  If the abstract value is too small, the input value is rounded to
 375a subnormal number, and the underflow and inexact exceptions are raised if
 376the abstract input cannot be represented exactly as a subnormal double-
 377precision floating-point number.
 378    The input significand `zSig' has its binary point between bits 62
 379and 61, which is 10 bits to the left of the usual location.  This shifted
 380significand must be normalized or smaller.  If `zSig' is not normalized,
 381`zExp' must be 0; in that case, the result returned is a subnormal number,
 382and it must not require rounding.  In the usual case that `zSig' is
 383normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 384The handling of underflow and overflow follows the IEC/IEEE Standard for
 385Binary Floating-point Arithmetic.
 386-------------------------------------------------------------------------------
 387*/
 388static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
 389{
 390    int8 roundingMode;
 391    flag roundNearestEven;
 392    int16 roundIncrement, roundBits;
 393    flag isTiny;
 394
 395    roundingMode = roundData->mode;
 396    roundNearestEven = ( roundingMode == float_round_nearest_even );
 397    roundIncrement = 0x200;
 398    if ( ! roundNearestEven ) {
 399        if ( roundingMode == float_round_to_zero ) {
 400            roundIncrement = 0;
 401        }
 402        else {
 403            roundIncrement = 0x3FF;
 404            if ( zSign ) {
 405                if ( roundingMode == float_round_up ) roundIncrement = 0;
 406            }
 407            else {
 408                if ( roundingMode == float_round_down ) roundIncrement = 0;
 409            }
 410        }
 411    }
 412    roundBits = zSig & 0x3FF;
 413    if ( 0x7FD <= (bits16) zExp ) {
 414        if (    ( 0x7FD < zExp )
 415             || (    ( zExp == 0x7FD )
 416                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 417           ) {
 418            //register int lr = __builtin_return_address(0);
 419            //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
 420            roundData->exception |= float_flag_overflow | float_flag_inexact;
 421            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 422        }
 423        if ( zExp < 0 ) {
 424            isTiny =
 425                   ( float_detect_tininess == float_tininess_before_rounding )
 426                || ( zExp < -1 )
 427                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 428            shift64RightJamming( zSig, - zExp, &zSig );
 429            zExp = 0;
 430            roundBits = zSig & 0x3FF;
 431            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 432        }
 433    }
 434    if ( roundBits ) roundData->exception |= float_flag_inexact;
 435    zSig = ( zSig + roundIncrement )>>10;
 436    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 437    if ( zSig == 0 ) zExp = 0;
 438    return packFloat64( zSign, zExp, zSig );
 439
 440}
 441
 442/*
 443-------------------------------------------------------------------------------
 444Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 445and significand `zSig', and returns the proper double-precision floating-
 446point value corresponding to the abstract input.  This routine is just like
 447`roundAndPackFloat64' except that `zSig' does not have to be normalized in
 448any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 449point exponent.
 450-------------------------------------------------------------------------------
 451*/
 452static float64
 453 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
 454{
 455    int8 shiftCount;
 456
 457    shiftCount = countLeadingZeros64( zSig ) - 1;
 458    return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
 459
 460}
 461
 462#ifdef FLOATX80
 463
 464/*
 465-------------------------------------------------------------------------------
 466Returns the fraction bits of the extended double-precision floating-point
 467value `a'.
 468-------------------------------------------------------------------------------
 469*/
 470INLINE bits64 extractFloatx80Frac( floatx80 a )
 471{
 472
 473    return a.low;
 474
 475}
 476
 477/*
 478-------------------------------------------------------------------------------
 479Returns the exponent bits of the extended double-precision floating-point
 480value `a'.
 481-------------------------------------------------------------------------------
 482*/
 483INLINE int32 extractFloatx80Exp( floatx80 a )
 484{
 485
 486    return a.high & 0x7FFF;
 487
 488}
 489
 490/*
 491-------------------------------------------------------------------------------
 492Returns the sign bit of the extended double-precision floating-point value
 493`a'.
 494-------------------------------------------------------------------------------
 495*/
 496INLINE flag extractFloatx80Sign( floatx80 a )
 497{
 498
 499    return a.high>>15;
 500
 501}
 502
 503/*
 504-------------------------------------------------------------------------------
 505Normalizes the subnormal extended double-precision floating-point value
 506represented by the denormalized significand `aSig'.  The normalized exponent
 507and significand are stored at the locations pointed to by `zExpPtr' and
 508`zSigPtr', respectively.
 509-------------------------------------------------------------------------------
 510*/
 511static void
 512 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 513{
 514    int8 shiftCount;
 515
 516    shiftCount = countLeadingZeros64( aSig );
 517    *zSigPtr = aSig<<shiftCount;
 518    *zExpPtr = 1 - shiftCount;
 519
 520}
 521
 522/*
 523-------------------------------------------------------------------------------
 524Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 525extended double-precision floating-point value, returning the result.
 526-------------------------------------------------------------------------------
 527*/
 528INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 529{
 530    floatx80 z;
 531
 532    z.low = zSig;
 533    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 534    z.__padding = 0;
 535    return z;
 536
 537}
 538
 539/*
 540-------------------------------------------------------------------------------
 541Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 542and extended significand formed by the concatenation of `zSig0' and `zSig1',
 543and returns the proper extended double-precision floating-point value
 544corresponding to the abstract input.  Ordinarily, the abstract value is
 545rounded and packed into the extended double-precision format, with the
 546inexact exception raised if the abstract input cannot be represented
 547exactly.  If the abstract value is too large, however, the overflow and
 548inexact exceptions are raised and an infinity or maximal finite value is
 549returned.  If the abstract value is too small, the input value is rounded to
 550a subnormal number, and the underflow and inexact exceptions are raised if
 551the abstract input cannot be represented exactly as a subnormal extended
 552double-precision floating-point number.
 553    If `roundingPrecision' is 32 or 64, the result is rounded to the same
 554number of bits as single or double precision, respectively.  Otherwise, the
 555result is rounded to the full precision of the extended double-precision
 556format.
 557    The input significand must be normalized or smaller.  If the input
 558significand is not normalized, `zExp' must be 0; in that case, the result
 559returned is a subnormal number, and it must not require rounding.  The
 560handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 561Floating-point Arithmetic.
 562-------------------------------------------------------------------------------
 563*/
 564static floatx80
 565 roundAndPackFloatx80(
 566     struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 567 )
 568{
 569    int8 roundingMode, roundingPrecision;
 570    flag roundNearestEven, increment, isTiny;
 571    int64 roundIncrement, roundMask, roundBits;
 572
 573    roundingMode = roundData->mode;
 574    roundingPrecision = roundData->precision;
 575    roundNearestEven = ( roundingMode == float_round_nearest_even );
 576    if ( roundingPrecision == 80 ) goto precision80;
 577    if ( roundingPrecision == 64 ) {
 578        roundIncrement = LIT64( 0x0000000000000400 );
 579        roundMask = LIT64( 0x00000000000007FF );
 580    }
 581    else if ( roundingPrecision == 32 ) {
 582        roundIncrement = LIT64( 0x0000008000000000 );
 583        roundMask = LIT64( 0x000000FFFFFFFFFF );
 584    }
 585    else {
 586        goto precision80;
 587    }
 588    zSig0 |= ( zSig1 != 0 );
 589    if ( ! roundNearestEven ) {
 590        if ( roundingMode == float_round_to_zero ) {
 591            roundIncrement = 0;
 592        }
 593        else {
 594            roundIncrement = roundMask;
 595            if ( zSign ) {
 596                if ( roundingMode == float_round_up ) roundIncrement = 0;
 597            }
 598            else {
 599                if ( roundingMode == float_round_down ) roundIncrement = 0;
 600            }
 601        }
 602    }
 603    roundBits = zSig0 & roundMask;
 604    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 605        if (    ( 0x7FFE < zExp )
 606             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 607           ) {
 608            goto overflow;
 609        }
 610        if ( zExp <= 0 ) {
 611            isTiny =
 612                   ( float_detect_tininess == float_tininess_before_rounding )
 613                || ( zExp < 0 )
 614                || ( zSig0 <= zSig0 + roundIncrement );
 615            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 616            zExp = 0;
 617            roundBits = zSig0 & roundMask;
 618            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 619            if ( roundBits ) roundData->exception |= float_flag_inexact;
 620            zSig0 += roundIncrement;
 621            if ( (sbits64) zSig0 < 0 ) zExp = 1;
 622            roundIncrement = roundMask + 1;
 623            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 624                roundMask |= roundIncrement;
 625            }
 626            zSig0 &= ~ roundMask;
 627            return packFloatx80( zSign, zExp, zSig0 );
 628        }
 629    }
 630    if ( roundBits ) roundData->exception |= float_flag_inexact;
 631    zSig0 += roundIncrement;
 632    if ( zSig0 < roundIncrement ) {
 633        ++zExp;
 634        zSig0 = LIT64( 0x8000000000000000 );
 635    }
 636    roundIncrement = roundMask + 1;
 637    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 638        roundMask |= roundIncrement;
 639    }
 640    zSig0 &= ~ roundMask;
 641    if ( zSig0 == 0 ) zExp = 0;
 642    return packFloatx80( zSign, zExp, zSig0 );
 643 precision80:
 644    increment = ( (sbits64) zSig1 < 0 );
 645    if ( ! roundNearestEven ) {
 646        if ( roundingMode == float_round_to_zero ) {
 647            increment = 0;
 648        }
 649        else {
 650            if ( zSign ) {
 651                increment = ( roundingMode == float_round_down ) && zSig1;
 652            }
 653            else {
 654                increment = ( roundingMode == float_round_up ) && zSig1;
 655            }
 656        }
 657    }
 658    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 659        if (    ( 0x7FFE < zExp )
 660             || (    ( zExp == 0x7FFE )
 661                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 662                  && increment
 663                )
 664           ) {
 665            roundMask = 0;
 666 overflow:
 667            roundData->exception |= float_flag_overflow | float_flag_inexact;
 668            if (    ( roundingMode == float_round_to_zero )
 669                 || ( zSign && ( roundingMode == float_round_up ) )
 670                 || ( ! zSign && ( roundingMode == float_round_down ) )
 671               ) {
 672                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 673            }
 674            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 675        }
 676        if ( zExp <= 0 ) {
 677            isTiny =
 678                   ( float_detect_tininess == float_tininess_before_rounding )
 679                || ( zExp < 0 )
 680                || ! increment
 681                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 682            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 683            zExp = 0;
 684            if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
 685            if ( zSig1 ) roundData->exception |= float_flag_inexact;
 686            if ( roundNearestEven ) {
 687                increment = ( (sbits64) zSig1 < 0 );
 688            }
 689            else {
 690                if ( zSign ) {
 691                    increment = ( roundingMode == float_round_down ) && zSig1;
 692                }
 693                else {
 694                    increment = ( roundingMode == float_round_up ) && zSig1;
 695                }
 696            }
 697            if ( increment ) {
 698                ++zSig0;
 699                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 700                if ( (sbits64) zSig0 < 0 ) zExp = 1;
 701            }
 702            return packFloatx80( zSign, zExp, zSig0 );
 703        }
 704    }
 705    if ( zSig1 ) roundData->exception |= float_flag_inexact;
 706    if ( increment ) {
 707        ++zSig0;
 708        if ( zSig0 == 0 ) {
 709            ++zExp;
 710            zSig0 = LIT64( 0x8000000000000000 );
 711        }
 712        else {
 713            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 714        }
 715    }
 716    else {
 717        if ( zSig0 == 0 ) zExp = 0;
 718    }
 719    
 720    return packFloatx80( zSign, zExp, zSig0 );
 721}
 722
 723/*
 724-------------------------------------------------------------------------------
 725Takes an abstract floating-point value having sign `zSign', exponent
 726`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 727and returns the proper extended double-precision floating-point value
 728corresponding to the abstract input.  This routine is just like
 729`roundAndPackFloatx80' except that the input significand does not have to be
 730normalized.
 731-------------------------------------------------------------------------------
 732*/
 733static floatx80
 734 normalizeRoundAndPackFloatx80(
 735     struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 736 )
 737{
 738    int8 shiftCount;
 739
 740    if ( zSig0 == 0 ) {
 741        zSig0 = zSig1;
 742        zSig1 = 0;
 743        zExp -= 64;
 744    }
 745    shiftCount = countLeadingZeros64( zSig0 );
 746    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 747    zExp -= shiftCount;
 748    return
 749        roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
 750
 751}
 752
 753#endif
 754
 755/*
 756-------------------------------------------------------------------------------
 757Returns the result of converting the 32-bit two's complement integer `a' to
 758the single-precision floating-point format.  The conversion is performed
 759according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 760-------------------------------------------------------------------------------
 761*/
 762float32 int32_to_float32(struct roundingData *roundData, int32 a)
 763{
 764    flag zSign;
 765
 766    if ( a == 0 ) return 0;
 767    if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 768    zSign = ( a < 0 );
 769    return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
 770
 771}
 772
 773/*
 774-------------------------------------------------------------------------------
 775Returns the result of converting the 32-bit two's complement integer `a' to
 776the double-precision floating-point format.  The conversion is performed
 777according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 778-------------------------------------------------------------------------------
 779*/
 780float64 int32_to_float64( int32 a )
 781{
 782    flag aSign;
 783    uint32 absA;
 784    int8 shiftCount;
 785    bits64 zSig;
 786
 787    if ( a == 0 ) return 0;
 788    aSign = ( a < 0 );
 789    absA = aSign ? - a : a;
 790    shiftCount = countLeadingZeros32( absA ) + 21;
 791    zSig = absA;
 792    return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
 793
 794}
 795
 796#ifdef FLOATX80
 797
 798/*
 799-------------------------------------------------------------------------------
 800Returns the result of converting the 32-bit two's complement integer `a'
 801to the extended double-precision floating-point format.  The conversion
 802is performed according to the IEC/IEEE Standard for Binary Floating-point
 803Arithmetic.
 804-------------------------------------------------------------------------------
 805*/
 806floatx80 int32_to_floatx80( int32 a )
 807{
 808    flag zSign;
 809    uint32 absA;
 810    int8 shiftCount;
 811    bits64 zSig;
 812
 813    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 814    zSign = ( a < 0 );
 815    absA = zSign ? - a : a;
 816    shiftCount = countLeadingZeros32( absA ) + 32;
 817    zSig = absA;
 818    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 819
 820}
 821
 822#endif
 823
 824/*
 825-------------------------------------------------------------------------------
 826Returns the result of converting the single-precision floating-point value
 827`a' to the 32-bit two's complement integer format.  The conversion is
 828performed according to the IEC/IEEE Standard for Binary Floating-point
 829Arithmetic---which means in particular that the conversion is rounded
 830according to the current rounding mode.  If `a' is a NaN, the largest
 831positive integer is returned.  Otherwise, if the conversion overflows, the
 832largest integer with the same sign as `a' is returned.
 833-------------------------------------------------------------------------------
 834*/
 835int32 float32_to_int32( struct roundingData *roundData, float32 a )
 836{
 837    flag aSign;
 838    int16 aExp, shiftCount;
 839    bits32 aSig;
 840    bits64 zSig;
 841
 842    aSig = extractFloat32Frac( a );
 843    aExp = extractFloat32Exp( a );
 844    aSign = extractFloat32Sign( a );
 845    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 846    if ( aExp ) aSig |= 0x00800000;
 847    shiftCount = 0xAF - aExp;
 848    zSig = aSig;
 849    zSig <<= 32;
 850    if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
 851    return roundAndPackInt32( roundData, aSign, zSig );
 852
 853}
 854
 855/*
 856-------------------------------------------------------------------------------
 857Returns the result of converting the single-precision floating-point value
 858`a' to the 32-bit two's complement integer format.  The conversion is
 859performed according to the IEC/IEEE Standard for Binary Floating-point
 860Arithmetic, except that the conversion is always rounded toward zero.  If
 861`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 862conversion overflows, the largest integer with the same sign as `a' is
 863returned.
 864-------------------------------------------------------------------------------
 865*/
 866int32 float32_to_int32_round_to_zero( float32 a )
 867{
 868    flag aSign;
 869    int16 aExp, shiftCount;
 870    bits32 aSig;
 871    int32 z;
 872
 873    aSig = extractFloat32Frac( a );
 874    aExp = extractFloat32Exp( a );
 875    aSign = extractFloat32Sign( a );
 876    shiftCount = aExp - 0x9E;
 877    if ( 0 <= shiftCount ) {
 878        if ( a == 0xCF000000 ) return 0x80000000;
 879        float_raise( float_flag_invalid );
 880        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 881        return 0x80000000;
 882    }
 883    else if ( aExp <= 0x7E ) {
 884        if ( aExp | aSig ) float_raise( float_flag_inexact );
 885        return 0;
 886    }
 887    aSig = ( aSig | 0x00800000 )<<8;
 888    z = aSig>>( - shiftCount );
 889    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 890        float_raise( float_flag_inexact );
 891    }
 892    return aSign ? - z : z;
 893
 894}
 895
 896/*
 897-------------------------------------------------------------------------------
 898Returns the result of converting the single-precision floating-point value
 899`a' to the double-precision floating-point format.  The conversion is
 900performed according to the IEC/IEEE Standard for Binary Floating-point
 901Arithmetic.
 902-------------------------------------------------------------------------------
 903*/
 904float64 float32_to_float64( float32 a )
 905{
 906    flag aSign;
 907    int16 aExp;
 908    bits32 aSig;
 909
 910    aSig = extractFloat32Frac( a );
 911    aExp = extractFloat32Exp( a );
 912    aSign = extractFloat32Sign( a );
 913    if ( aExp == 0xFF ) {
 914        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 915        return packFloat64( aSign, 0x7FF, 0 );
 916    }
 917    if ( aExp == 0 ) {
 918        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 919        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 920        --aExp;
 921    }
 922    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 923
 924}
 925
 926#ifdef FLOATX80
 927
 928/*
 929-------------------------------------------------------------------------------
 930Returns the result of converting the single-precision floating-point value
 931`a' to the extended double-precision floating-point format.  The conversion
 932is performed according to the IEC/IEEE Standard for Binary Floating-point
 933Arithmetic.
 934-------------------------------------------------------------------------------
 935*/
 936floatx80 float32_to_floatx80( float32 a )
 937{
 938    flag aSign;
 939    int16 aExp;
 940    bits32 aSig;
 941
 942    aSig = extractFloat32Frac( a );
 943    aExp = extractFloat32Exp( a );
 944    aSign = extractFloat32Sign( a );
 945    if ( aExp == 0xFF ) {
 946        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 947        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 948    }
 949    if ( aExp == 0 ) {
 950        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 951        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 952    }
 953    aSig |= 0x00800000;
 954    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 955
 956}
 957
 958#endif
 959
 960/*
 961-------------------------------------------------------------------------------
 962Rounds the single-precision floating-point value `a' to an integer, and
 963returns the result as a single-precision floating-point value.  The
 964operation is performed according to the IEC/IEEE Standard for Binary
 965Floating-point Arithmetic.
 966-------------------------------------------------------------------------------
 967*/
 968float32 float32_round_to_int( struct roundingData *roundData, float32 a )
 969{
 970    flag aSign;
 971    int16 aExp;
 972    bits32 lastBitMask, roundBitsMask;
 973    int8 roundingMode;
 974    float32 z;
 975
 976    aExp = extractFloat32Exp( a );
 977    if ( 0x96 <= aExp ) {
 978        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 979            return propagateFloat32NaN( a, a );
 980        }
 981        return a;
 982    }
 983    roundingMode = roundData->mode;
 984    if ( aExp <= 0x7E ) {
 985        if ( (bits32) ( a<<1 ) == 0 ) return a;
 986        roundData->exception |= float_flag_inexact;
 987        aSign = extractFloat32Sign( a );
 988        switch ( roundingMode ) {
 989         case float_round_nearest_even:
 990            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 991                return packFloat32( aSign, 0x7F, 0 );
 992            }
 993            break;
 994         case float_round_down:
 995            return aSign ? 0xBF800000 : 0;
 996         case float_round_up:
 997            return aSign ? 0x80000000 : 0x3F800000;
 998        }
 999        return packFloat32( aSign, 0, 0 );
1000    }
1001    lastBitMask = 1;
1002    lastBitMask <<= 0x96 - aExp;
1003    roundBitsMask = lastBitMask - 1;
1004    z = a;
1005    if ( roundingMode == float_round_nearest_even ) {
1006        z += lastBitMask>>1;
1007        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1008    }
1009    else if ( roundingMode != float_round_to_zero ) {
1010        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1011            z += roundBitsMask;
1012        }
1013    }
1014    z &= ~ roundBitsMask;
1015    if ( z != a ) roundData->exception |= float_flag_inexact;
1016    return z;
1017
1018}
1019
1020/*
1021-------------------------------------------------------------------------------
1022Returns the result of adding the absolute values of the single-precision
1023floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1024before being returned.  `zSign' is ignored if the result is a NaN.  The
1025addition is performed according to the IEC/IEEE Standard for Binary
1026Floating-point Arithmetic.
1027-------------------------------------------------------------------------------
1028*/
1029static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1030{
1031    int16 aExp, bExp, zExp;
1032    bits32 aSig, bSig, zSig;
1033    int16 expDiff;
1034
1035    aSig = extractFloat32Frac( a );
1036    aExp = extractFloat32Exp( a );
1037    bSig = extractFloat32Frac( b );
1038    bExp = extractFloat32Exp( b );
1039    expDiff = aExp - bExp;
1040    aSig <<= 6;
1041    bSig <<= 6;
1042    if ( 0 < expDiff ) {
1043        if ( aExp == 0xFF ) {
1044            if ( aSig ) return propagateFloat32NaN( a, b );
1045            return a;
1046        }
1047        if ( bExp == 0 ) {
1048            --expDiff;
1049        }
1050        else {
1051            bSig |= 0x20000000;
1052        }
1053        shift32RightJamming( bSig, expDiff, &bSig );
1054        zExp = aExp;
1055    }
1056    else if ( expDiff < 0 ) {
1057        if ( bExp == 0xFF ) {
1058            if ( bSig ) return propagateFloat32NaN( a, b );
1059            return packFloat32( zSign, 0xFF, 0 );
1060        }
1061        if ( aExp == 0 ) {
1062            ++expDiff;
1063        }
1064        else {
1065            aSig |= 0x20000000;
1066        }
1067        shift32RightJamming( aSig, - expDiff, &aSig );
1068        zExp = bExp;
1069    }
1070    else {
1071        if ( aExp == 0xFF ) {
1072            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1073            return a;
1074        }
1075        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1076        zSig = 0x40000000 + aSig + bSig;
1077        zExp = aExp;
1078        goto roundAndPack;
1079    }
1080    aSig |= 0x20000000;
1081    zSig = ( aSig + bSig )<<1;
1082    --zExp;
1083    if ( (sbits32) zSig < 0 ) {
1084        zSig = aSig + bSig;
1085        ++zExp;
1086    }
1087 roundAndPack:
1088    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1089
1090}
1091
1092/*
1093-------------------------------------------------------------------------------
1094Returns the result of subtracting the absolute values of the single-
1095precision floating-point values `a' and `b'.  If `zSign' is true, the
1096difference is negated before being returned.  `zSign' is ignored if the
1097result is a NaN.  The subtraction is performed according to the IEC/IEEE
1098Standard for Binary Floating-point Arithmetic.
1099-------------------------------------------------------------------------------
1100*/
1101static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1102{
1103    int16 aExp, bExp, zExp;
1104    bits32 aSig, bSig, zSig;
1105    int16 expDiff;
1106
1107    aSig = extractFloat32Frac( a );
1108    aExp = extractFloat32Exp( a );
1109    bSig = extractFloat32Frac( b );
1110    bExp = extractFloat32Exp( b );
1111    expDiff = aExp - bExp;
1112    aSig <<= 7;
1113    bSig <<= 7;
1114    if ( 0 < expDiff ) goto aExpBigger;
1115    if ( expDiff < 0 ) goto bExpBigger;
1116    if ( aExp == 0xFF ) {
1117        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1118        roundData->exception |= float_flag_invalid;
1119        return float32_default_nan;
1120    }
1121    if ( aExp == 0 ) {
1122        aExp = 1;
1123        bExp = 1;
1124    }
1125    if ( bSig < aSig ) goto aBigger;
1126    if ( aSig < bSig ) goto bBigger;
1127    return packFloat32( roundData->mode == float_round_down, 0, 0 );
1128 bExpBigger:
1129    if ( bExp == 0xFF ) {
1130        if ( bSig ) return propagateFloat32NaN( a, b );
1131        return packFloat32( zSign ^ 1, 0xFF, 0 );
1132    }
1133    if ( aExp == 0 ) {
1134        ++expDiff;
1135    }
1136    else {
1137        aSig |= 0x40000000;
1138    }
1139    shift32RightJamming( aSig, - expDiff, &aSig );
1140    bSig |= 0x40000000;
1141 bBigger:
1142    zSig = bSig - aSig;
1143    zExp = bExp;
1144    zSign ^= 1;
1145    goto normalizeRoundAndPack;
1146 aExpBigger:
1147    if ( aExp == 0xFF ) {
1148        if ( aSig ) return propagateFloat32NaN( a, b );
1149        return a;
1150    }
1151    if ( bExp == 0 ) {
1152        --expDiff;
1153    }
1154    else {
1155        bSig |= 0x40000000;
1156    }
1157    shift32RightJamming( bSig, expDiff, &bSig );
1158    aSig |= 0x40000000;
1159 aBigger:
1160    zSig = aSig - bSig;
1161    zExp = aExp;
1162 normalizeRoundAndPack:
1163    --zExp;
1164    return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1165
1166}
1167
1168/*
1169-------------------------------------------------------------------------------
1170Returns the result of adding the single-precision floating-point values `a'
1171and `b'.  The operation is performed according to the IEC/IEEE Standard for
1172Binary Floating-point Arithmetic.
1173-------------------------------------------------------------------------------
1174*/
1175float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1176{
1177    flag aSign, bSign;
1178
1179    aSign = extractFloat32Sign( a );
1180    bSign = extractFloat32Sign( b );
1181    if ( aSign == bSign ) {
1182        return addFloat32Sigs( roundData, a, b, aSign );
1183    }
1184    else {
1185        return subFloat32Sigs( roundData, a, b, aSign );
1186    }
1187
1188}
1189
1190/*
1191-------------------------------------------------------------------------------
1192Returns the result of subtracting the single-precision floating-point values
1193`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1194for Binary Floating-point Arithmetic.
1195-------------------------------------------------------------------------------
1196*/
1197float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1198{
1199    flag aSign, bSign;
1200
1201    aSign = extractFloat32Sign( a );
1202    bSign = extractFloat32Sign( b );
1203    if ( aSign == bSign ) {
1204        return subFloat32Sigs( roundData, a, b, aSign );
1205    }
1206    else {
1207        return addFloat32Sigs( roundData, a, b, aSign );
1208    }
1209
1210}
1211
1212/*
1213-------------------------------------------------------------------------------
1214Returns the result of multiplying the single-precision floating-point values
1215`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1216for Binary Floating-point Arithmetic.
1217-------------------------------------------------------------------------------
1218*/
1219float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1220{
1221    flag aSign, bSign, zSign;
1222    int16 aExp, bExp, zExp;
1223    bits32 aSig, bSig;
1224    bits64 zSig64;
1225    bits32 zSig;
1226
1227    aSig = extractFloat32Frac( a );
1228    aExp = extractFloat32Exp( a );
1229    aSign = extractFloat32Sign( a );
1230    bSig = extractFloat32Frac( b );
1231    bExp = extractFloat32Exp( b );
1232    bSign = extractFloat32Sign( b );
1233    zSign = aSign ^ bSign;
1234    if ( aExp == 0xFF ) {
1235        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1236            return propagateFloat32NaN( a, b );
1237        }
1238        if ( ( bExp | bSig ) == 0 ) {
1239            roundData->exception |= float_flag_invalid;
1240            return float32_default_nan;
1241        }
1242        return packFloat32( zSign, 0xFF, 0 );
1243    }
1244    if ( bExp == 0xFF ) {
1245        if ( bSig ) return propagateFloat32NaN( a, b );
1246        if ( ( aExp | aSig ) == 0 ) {
1247            roundData->exception |= float_flag_invalid;
1248            return float32_default_nan;
1249        }
1250        return packFloat32( zSign, 0xFF, 0 );
1251    }
1252    if ( aExp == 0 ) {
1253        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1254        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1255    }
1256    if ( bExp == 0 ) {
1257        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1258        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1259    }
1260    zExp = aExp + bExp - 0x7F;
1261    aSig = ( aSig | 0x00800000 )<<7;
1262    bSig = ( bSig | 0x00800000 )<<8;
1263    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1264    zSig = zSig64;
1265    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1266        zSig <<= 1;
1267        --zExp;
1268    }
1269    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1270
1271}
1272
1273/*
1274-------------------------------------------------------------------------------
1275Returns the result of dividing the single-precision floating-point value `a'
1276by the corresponding value `b'.  The operation is performed according to the
1277IEC/IEEE Standard for Binary Floating-point Arithmetic.
1278-------------------------------------------------------------------------------
1279*/
1280float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1281{
1282    flag aSign, bSign, zSign;
1283    int16 aExp, bExp, zExp;
1284    bits32 aSig, bSig, zSig;
1285
1286    aSig = extractFloat32Frac( a );
1287    aExp = extractFloat32Exp( a );
1288    aSign = extractFloat32Sign( a );
1289    bSig = extractFloat32Frac( b );
1290    bExp = extractFloat32Exp( b );
1291    bSign = extractFloat32Sign( b );
1292    zSign = aSign ^ bSign;
1293    if ( aExp == 0xFF ) {
1294        if ( aSig ) return propagateFloat32NaN( a, b );
1295        if ( bExp == 0xFF ) {
1296            if ( bSig ) return propagateFloat32NaN( a, b );
1297            roundData->exception |= float_flag_invalid;
1298            return float32_default_nan;
1299        }
1300        return packFloat32( zSign, 0xFF, 0 );
1301    }
1302    if ( bExp == 0xFF ) {
1303        if ( bSig ) return propagateFloat32NaN( a, b );
1304        return packFloat32( zSign, 0, 0 );
1305    }
1306    if ( bExp == 0 ) {
1307        if ( bSig == 0 ) {
1308            if ( ( aExp | aSig ) == 0 ) {
1309                roundData->exception |= float_flag_invalid;
1310                return float32_default_nan;
1311            }
1312            roundData->exception |= float_flag_divbyzero;
1313            return packFloat32( zSign, 0xFF, 0 );
1314        }
1315        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1316    }
1317    if ( aExp == 0 ) {
1318        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1319        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1320    }
1321    zExp = aExp - bExp + 0x7D;
1322    aSig = ( aSig | 0x00800000 )<<7;
1323    bSig = ( bSig | 0x00800000 )<<8;
1324    if ( bSig <= ( aSig + aSig ) ) {
1325        aSig >>= 1;
1326        ++zExp;
1327    }
1328    {
1329        bits64 tmp = ( (bits64) aSig )<<32;
1330        do_div( tmp, bSig );
1331        zSig = tmp;
1332    }
1333    if ( ( zSig & 0x3F ) == 0 ) {
1334        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1335    }
1336    return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1337
1338}
1339
1340/*
1341-------------------------------------------------------------------------------
1342Returns the remainder of the single-precision floating-point value `a'
1343with respect to the corresponding value `b'.  The operation is performed
1344according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1345-------------------------------------------------------------------------------
1346*/
1347float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1348{
1349    flag aSign, bSign, zSign;
1350    int16 aExp, bExp, expDiff;
1351    bits32 aSig, bSig;
1352    bits32 q;
1353    bits64 aSig64, bSig64, q64;
1354    bits32 alternateASig;
1355    sbits32 sigMean;
1356
1357    aSig = extractFloat32Frac( a );
1358    aExp = extractFloat32Exp( a );
1359    aSign = extractFloat32Sign( a );
1360    bSig = extractFloat32Frac( b );
1361    bExp = extractFloat32Exp( b );
1362    bSign = extractFloat32Sign( b );
1363    if ( aExp == 0xFF ) {
1364        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1365            return propagateFloat32NaN( a, b );
1366        }
1367        roundData->exception |= float_flag_invalid;
1368        return float32_default_nan;
1369    }
1370    if ( bExp == 0xFF ) {
1371        if ( bSig ) return propagateFloat32NaN( a, b );
1372        return a;
1373    }
1374    if ( bExp == 0 ) {
1375        if ( bSig == 0 ) {
1376            roundData->exception |= float_flag_invalid;
1377            return float32_default_nan;
1378        }
1379        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1380    }
1381    if ( aExp == 0 ) {
1382        if ( aSig == 0 ) return a;
1383        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1384    }
1385    expDiff = aExp - bExp;
1386    aSig |= 0x00800000;
1387    bSig |= 0x00800000;
1388    if ( expDiff < 32 ) {
1389        aSig <<= 8;
1390        bSig <<= 8;
1391        if ( expDiff < 0 ) {
1392            if ( expDiff < -1 ) return a;
1393            aSig >>= 1;
1394        }
1395        q = ( bSig <= aSig );
1396        if ( q ) aSig -= bSig;
1397        if ( 0 < expDiff ) {
1398            bits64 tmp = ( (bits64) aSig )<<32;
1399            do_div( tmp, bSig );
1400            q = tmp;
1401            q >>= 32 - expDiff;
1402            bSig >>= 2;
1403            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1404        }
1405        else {
1406            aSig >>= 2;
1407            bSig >>= 2;
1408        }
1409    }
1410    else {
1411        if ( bSig <= aSig ) aSig -= bSig;
1412        aSig64 = ( (bits64) aSig )<<40;
1413        bSig64 = ( (bits64) bSig )<<40;
1414        expDiff -= 64;
1415        while ( 0 < expDiff ) {
1416            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1417            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1418            aSig64 = - ( ( bSig * q64 )<<38 );
1419            expDiff -= 62;
1420        }
1421        expDiff += 64;
1422        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1423        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1424        q = q64>>( 64 - expDiff );
1425        bSig <<= 6;
1426        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1427    }
1428    do {
1429        alternateASig = aSig;
1430        ++q;
1431        aSig -= bSig;
1432    } while ( 0 <= (sbits32) aSig );
1433    sigMean = aSig + alternateASig;
1434    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1435        aSig = alternateASig;
1436    }
1437    zSign = ( (sbits32) aSig < 0 );
1438    if ( zSign ) aSig = - aSig;
1439    return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1440
1441}
1442
1443/*
1444-------------------------------------------------------------------------------
1445Returns the square root of the single-precision floating-point value `a'.
1446The operation is performed according to the IEC/IEEE Standard for Binary
1447Floating-point Arithmetic.
1448-------------------------------------------------------------------------------
1449*/
1450float32 float32_sqrt( struct roundingData *roundData, float32 a )
1451{
1452    flag aSign;
1453    int16 aExp, zExp;
1454    bits32 aSig, zSig;
1455    bits64 rem, term;
1456
1457    aSig = extractFloat32Frac( a );
1458    aExp = extractFloat32Exp( a );
1459    aSign = extractFloat32Sign( a );
1460    if ( aExp == 0xFF ) {
1461        if ( aSig ) return propagateFloat32NaN( a, 0 );
1462        if ( ! aSign ) return a;
1463        roundData->exception |= float_flag_invalid;
1464        return float32_default_nan;
1465    }
1466    if ( aSign ) {
1467        if ( ( aExp | aSig ) == 0 ) return a;
1468        roundData->exception |= float_flag_invalid;
1469        return float32_default_nan;
1470    }
1471    if ( aExp == 0 ) {
1472        if ( aSig == 0 ) return 0;
1473        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1474    }
1475    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1476    aSig = ( aSig | 0x00800000 )<<8;
1477    zSig = estimateSqrt32( aExp, aSig ) + 2;
1478    if ( ( zSig & 0x7F ) <= 5 ) {
1479        if ( zSig < 2 ) {
1480            zSig = 0xFFFFFFFF;
1481        }
1482        else {
1483            aSig >>= aExp & 1;
1484            term = ( (bits64) zSig ) * zSig;
1485            rem = ( ( (bits64) aSig )<<32 ) - term;
1486            while ( (sbits64) rem < 0 ) {
1487                --zSig;
1488                rem += ( ( (bits64) zSig )<<1 ) | 1;
1489            }
1490            zSig |= ( rem != 0 );
1491        }
1492    }
1493    shift32RightJamming( zSig, 1, &zSig );
1494    return roundAndPackFloat32( roundData, 0, zExp, zSig );
1495
1496}
1497
1498/*
1499-------------------------------------------------------------------------------
1500Returns 1 if the single-precision floating-point value `a' is equal to the
1501corresponding value `b', and 0 otherwise.  The comparison is performed
1502according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503-------------------------------------------------------------------------------
1504*/
1505flag float32_eq( float32 a, float32 b )
1506{
1507
1508    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510       ) {
1511        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1512            float_raise( float_flag_invalid );
1513        }
1514        return 0;
1515    }
1516    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517
1518}
1519
1520/*
1521-------------------------------------------------------------------------------
1522Returns 1 if the single-precision floating-point value `a' is less than or
1523equal to the corresponding value `b', and 0 otherwise.  The comparison is
1524performed according to the IEC/IEEE Standard for Binary Floating-point
1525Arithmetic.
1526-------------------------------------------------------------------------------
1527*/
1528flag float32_le( float32 a, float32 b )
1529{
1530    flag aSign, bSign;
1531
1532    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534       ) {
1535        float_raise( float_flag_invalid );
1536        return 0;
1537    }
1538    aSign = extractFloat32Sign( a );
1539    bSign = extractFloat32Sign( b );
1540    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541    return ( a == b ) || ( aSign ^ ( a < b ) );
1542
1543}
1544
1545/*
1546-------------------------------------------------------------------------------
1547Returns 1 if the single-precision floating-point value `a' is less than
1548the corresponding value `b', and 0 otherwise.  The comparison is performed
1549according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550-------------------------------------------------------------------------------
1551*/
1552flag float32_lt( float32 a, float32 b )
1553{
1554    flag aSign, bSign;
1555
1556    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558       ) {
1559        float_raise( float_flag_invalid );
1560        return 0;
1561    }
1562    aSign = extractFloat32Sign( a );
1563    bSign = extractFloat32Sign( b );
1564    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565    return ( a != b ) && ( aSign ^ ( a < b ) );
1566
1567}
1568
1569/*
1570-------------------------------------------------------------------------------
1571Returns 1 if the single-precision floating-point value `a' is equal to the
1572corresponding value `b', and 0 otherwise.  The invalid exception is raised
1573if either operand is a NaN.  Otherwise, the comparison is performed
1574according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575-------------------------------------------------------------------------------
1576*/
1577flag float32_eq_signaling( float32 a, float32 b )
1578{
1579
1580    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582       ) {
1583        float_raise( float_flag_invalid );
1584        return 0;
1585    }
1586    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587
1588}
1589
1590/*
1591-------------------------------------------------------------------------------
1592Returns 1 if the single-precision floating-point value `a' is less than or
1593equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1594cause an exception.  Otherwise, the comparison is performed according to the
1595IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596-------------------------------------------------------------------------------
1597*/
1598flag float32_le_quiet( float32 a, float32 b )
1599{
1600    flag aSign, bSign;
1601    //int16 aExp, bExp;
1602
1603    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605       ) {
1606        /* Do nothing, even if NaN as we're quiet */
1607        return 0;
1608    }
1609    aSign = extractFloat32Sign( a );
1610    bSign = extractFloat32Sign( b );
1611    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1612    return ( a == b ) || ( aSign ^ ( a < b ) );
1613
1614}
1615
1616/*
1617-------------------------------------------------------------------------------
1618Returns 1 if the single-precision floating-point value `a' is less than
1619the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1620exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1621Standard for Binary Floating-point Arithmetic.
1622-------------------------------------------------------------------------------
1623*/
1624flag float32_lt_quiet( float32 a, float32 b )
1625{
1626    flag aSign, bSign;
1627
1628    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1629         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1630       ) {
1631        /* Do nothing, even if NaN as we're quiet */
1632        return 0;
1633    }
1634    aSign = extractFloat32Sign( a );
1635    bSign = extractFloat32Sign( b );
1636    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637    return ( a != b ) && ( aSign ^ ( a < b ) );
1638
1639}
1640
1641/*
1642-------------------------------------------------------------------------------
1643Returns the result of converting the double-precision floating-point value
1644`a' to the 32-bit two's complement integer format.  The conversion is
1645performed according to the IEC/IEEE Standard for Binary Floating-point
1646Arithmetic---which means in particular that the conversion is rounded
1647according to the current rounding mode.  If `a' is a NaN, the largest
1648positive integer is returned.  Otherwise, if the conversion overflows, the
1649largest integer with the same sign as `a' is returned.
1650-------------------------------------------------------------------------------
1651*/
1652int32 float64_to_int32( struct roundingData *roundData, float64 a )
1653{
1654    flag aSign;
1655    int16 aExp, shiftCount;
1656    bits64 aSig;
1657
1658    aSig = extractFloat64Frac( a );
1659    aExp = extractFloat64Exp( a );
1660    aSign = extractFloat64Sign( a );
1661    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663    shiftCount = 0x42C - aExp;
1664    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665    return roundAndPackInt32( roundData, aSign, aSig );
1666
1667}
1668
1669/*
1670-------------------------------------------------------------------------------
1671Returns the result of converting the double-precision floating-point value
1672`a' to the 32-bit two's complement integer format.  The conversion is
1673performed according to the IEC/IEEE Standard for Binary Floating-point
1674Arithmetic, except that the conversion is always rounded toward zero.  If
1675`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1676conversion overflows, the largest integer with the same sign as `a' is
1677returned.
1678-------------------------------------------------------------------------------
1679*/
1680int32 float64_to_int32_round_to_zero( float64 a )
1681{
1682    flag aSign;
1683    int16 aExp, shiftCount;
1684    bits64 aSig, savedASig;
1685    int32 z;
1686
1687    aSig = extractFloat64Frac( a );
1688    aExp = extractFloat64Exp( a );
1689    aSign = extractFloat64Sign( a );
1690    shiftCount = 0x433 - aExp;
1691    if ( shiftCount < 21 ) {
1692        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1693        goto invalid;
1694    }
1695    else if ( 52 < shiftCount ) {
1696        if ( aExp || aSig ) float_raise( float_flag_inexact );
1697        return 0;
1698    }
1699    aSig |= LIT64( 0x0010000000000000 );
1700    savedASig = aSig;
1701    aSig >>= shiftCount;
1702    z = aSig;
1703    if ( aSign ) z = - z;
1704    if ( ( z < 0 ) ^ aSign ) {
1705 invalid:
1706        float_raise( float_flag_invalid );
1707        return aSign ? 0x80000000 : 0x7FFFFFFF;
1708    }
1709    if ( ( aSig<<shiftCount ) != savedASig ) {
1710        float_raise( float_flag_inexact );
1711    }
1712    return z;
1713
1714}
1715
1716/*
1717-------------------------------------------------------------------------------
1718Returns the result of converting the double-precision floating-point value
1719`a' to the 32-bit two's complement unsigned integer format.  The conversion
1720is performed according to the IEC/IEEE Standard for Binary Floating-point
1721Arithmetic---which means in particular that the conversion is rounded
1722according to the current rounding mode.  If `a' is a NaN, the largest
1723positive integer is returned.  Otherwise, if the conversion overflows, the
1724largest positive integer is returned.
1725-------------------------------------------------------------------------------
1726*/
1727int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1728{
1729    flag aSign;
1730    int16 aExp, shiftCount;
1731    bits64 aSig;
1732
1733    aSig = extractFloat64Frac( a );
1734    aExp = extractFloat64Exp( a );
1735    aSign = 0; //extractFloat64Sign( a );
1736    //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738    shiftCount = 0x42C - aExp;
1739    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740    return roundAndPackInt32( roundData, aSign, aSig );
1741}
1742
1743/*
1744-------------------------------------------------------------------------------
1745Returns the result of converting the double-precision floating-point value
1746`a' to the 32-bit two's complement integer format.  The conversion is
1747performed according to the IEC/IEEE Standard for Binary Floating-point
1748Arithmetic, except that the conversion is always rounded toward zero.  If
1749`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1750conversion overflows, the largest positive integer is returned.
1751-------------------------------------------------------------------------------
1752*/
1753int32 float64_to_uint32_round_to_zero( float64 a )
1754{
1755    flag aSign;
1756    int16 aExp, shiftCount;
1757    bits64 aSig, savedASig;
1758    int32 z;
1759
1760    aSig = extractFloat64Frac( a );
1761    aExp = extractFloat64Exp( a );
1762    aSign = extractFloat64Sign( a );
1763    shiftCount = 0x433 - aExp;
1764    if ( shiftCount < 21 ) {
1765        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1766        goto invalid;
1767    }
1768    else if ( 52 < shiftCount ) {
1769        if ( aExp || aSig ) float_raise( float_flag_inexact );
1770        return 0;
1771    }
1772    aSig |= LIT64( 0x0010000000000000 );
1773    savedASig = aSig;
1774    aSig >>= shiftCount;
1775    z = aSig;
1776    if ( aSign ) z = - z;
1777    if ( ( z < 0 ) ^ aSign ) {
1778 invalid:
1779        float_raise( float_flag_invalid );
1780        return aSign ? 0x80000000 : 0x7FFFFFFF;
1781    }
1782    if ( ( aSig<<shiftCount ) != savedASig ) {
1783        float_raise( float_flag_inexact );
1784    }
1785    return z;
1786}
1787
1788/*
1789-------------------------------------------------------------------------------
1790Returns the result of converting the double-precision floating-point value
1791`a' to the single-precision floating-point format.  The conversion is
1792performed according to the IEC/IEEE Standard for Binary Floating-point
1793Arithmetic.
1794-------------------------------------------------------------------------------
1795*/
1796float32 float64_to_float32( struct roundingData *roundData, float64 a )
1797{
1798    flag aSign;
1799    int16 aExp;
1800    bits64 aSig;
1801    bits32 zSig;
1802
1803    aSig = extractFloat64Frac( a );
1804    aExp = extractFloat64Exp( a );
1805    aSign = extractFloat64Sign( a );
1806    if ( aExp == 0x7FF ) {
1807        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808        return packFloat32( aSign, 0xFF, 0 );
1809    }
1810    shift64RightJamming( aSig, 22, &aSig );
1811    zSig = aSig;
1812    if ( aExp || zSig ) {
1813        zSig |= 0x40000000;
1814        aExp -= 0x381;
1815    }
1816    return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1817
1818}
1819
1820#ifdef FLOATX80
1821
1822/*
1823-------------------------------------------------------------------------------
1824Returns the result of converting the double-precision floating-point value
1825`a' to the extended double-precision floating-point format.  The conversion
1826is performed according to the IEC/IEEE Standard for Binary Floating-point
1827Arithmetic.
1828-------------------------------------------------------------------------------
1829*/
1830floatx80 float64_to_floatx80( float64 a )
1831{
1832    flag aSign;
1833    int16 aExp;
1834    bits64 aSig;
1835
1836    aSig = extractFloat64Frac( a );
1837    aExp = extractFloat64Exp( a );
1838    aSign = extractFloat64Sign( a );
1839    if ( aExp == 0x7FF ) {
1840        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1842    }
1843    if ( aExp == 0 ) {
1844        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1846    }
1847    return
1848        packFloatx80(
1849            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1850
1851}
1852
1853#endif
1854
1855/*
1856-------------------------------------------------------------------------------
1857Rounds the double-precision floating-point value `a' to an integer, and
1858returns the result as a double-precision floating-point value.  The
1859operation is performed according to the IEC/IEEE Standard for Binary
1860Floating-point Arithmetic.
1861-------------------------------------------------------------------------------
1862*/
1863float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1864{
1865    flag aSign;
1866    int16 aExp;
1867    bits64 lastBitMask, roundBitsMask;
1868    int8 roundingMode;
1869    float64 z;
1870
1871    aExp = extractFloat64Exp( a );
1872    if ( 0x433 <= aExp ) {
1873        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874            return propagateFloat64NaN( a, a );
1875        }
1876        return a;
1877    }
1878    if ( aExp <= 0x3FE ) {
1879        if ( (bits64) ( a<<1 ) == 0 ) return a;
1880        roundData->exception |= float_flag_inexact;
1881        aSign = extractFloat64Sign( a );
1882        switch ( roundData->mode ) {
1883         case float_round_nearest_even:
1884            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885                return packFloat64( aSign, 0x3FF, 0 );
1886            }
1887            break;
1888         case float_round_down:
1889            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890         case float_round_up:
1891            return
1892            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1893        }
1894        return packFloat64( aSign, 0, 0 );
1895    }
1896    lastBitMask = 1;
1897    lastBitMask <<= 0x433 - aExp;
1898    roundBitsMask = lastBitMask - 1;
1899    z = a;
1900    roundingMode = roundData->mode;
1901    if ( roundingMode == float_round_nearest_even ) {
1902        z += lastBitMask>>1;
1903        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1904    }
1905    else if ( roundingMode != float_round_to_zero ) {
1906        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1907            z += roundBitsMask;
1908        }
1909    }
1910    z &= ~ roundBitsMask;
1911    if ( z != a ) roundData->exception |= float_flag_inexact;
1912    return z;
1913
1914}
1915
1916/*
1917-------------------------------------------------------------------------------
1918Returns the result of adding the absolute values of the double-precision
1919floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1920before being returned.  `zSign' is ignored if the result is a NaN.  The
1921addition is performed according to the IEC/IEEE Standard for Binary
1922Floating-point Arithmetic.
1923-------------------------------------------------------------------------------
1924*/
1925static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1926{
1927    int16 aExp, bExp, zExp;
1928    bits64 aSig, bSig, zSig;
1929    int16 expDiff;
1930
1931    aSig = extractFloat64Frac( a );
1932    aExp = extractFloat64Exp( a );
1933    bSig = extractFloat64Frac( b );
1934    bExp = extractFloat64Exp( b );
1935    expDiff = aExp - bExp;
1936    aSig <<= 9;
1937    bSig <<= 9;
1938    if ( 0 < expDiff ) {
1939        if ( aExp == 0x7FF ) {
1940            if ( aSig ) return propagateFloat64NaN( a, b );
1941            return a;
1942        }
1943        if ( bExp == 0 ) {
1944            --expDiff;
1945        }
1946        else {
1947            bSig |= LIT64( 0x2000000000000000 );
1948        }
1949        shift64RightJamming( bSig, expDiff, &bSig );
1950        zExp = aExp;
1951    }
1952    else if ( expDiff < 0 ) {
1953        if ( bExp == 0x7FF ) {
1954            if ( bSig ) return propagateFloat64NaN( a, b );
1955            return packFloat64( zSign, 0x7FF, 0 );
1956        }
1957        if ( aExp == 0 ) {
1958            ++expDiff;
1959        }
1960        else {
1961            aSig |= LIT64( 0x2000000000000000 );
1962        }
1963        shift64RightJamming( aSig, - expDiff, &aSig );
1964        zExp = bExp;
1965    }
1966    else {
1967        if ( aExp == 0x7FF ) {
1968            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1969            return a;
1970        }
1971        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1973        zExp = aExp;
1974        goto roundAndPack;
1975    }
1976    aSig |= LIT64( 0x2000000000000000 );
1977    zSig = ( aSig + bSig )<<1;
1978    --zExp;
1979    if ( (sbits64) zSig < 0 ) {
1980        zSig = aSig + bSig;
1981        ++zExp;
1982    }
1983 roundAndPack:
1984    return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1985
1986}
1987
1988/*
1989-------------------------------------------------------------------------------
1990Returns the result of subtracting the absolute values of the double-
1991precision floating-point values `a' and `b'.  If `zSign' is true, the
1992difference is negated before being returned.  `zSign' is ignored if the
1993result is a NaN.  The subtraction is performed according to the IEC/IEEE
1994Standard for Binary Floating-point Arithmetic.
1995-------------------------------------------------------------------------------
1996*/
1997static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1998{
1999    int16 aExp, bExp, zExp;
2000    bits64 aSig, bSig, zSig;
2001    int16 expDiff;
2002
2003    aSig = extractFloat64Frac( a );
2004    aExp = extractFloat64Exp( a );
2005    bSig = extractFloat64Frac( b );
2006    bExp = extractFloat64Exp( b );
2007    expDiff = aExp - bExp;
2008    aSig <<= 10;
2009    bSig <<= 10;
2010    if ( 0 < expDiff ) goto aExpBigger;
2011    if ( expDiff < 0 ) goto bExpBigger;
2012    if ( aExp == 0x7FF ) {
2013        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014        roundData->exception |= float_flag_invalid;
2015        return float64_default_nan;
2016    }
2017    if ( aExp == 0 ) {
2018        aExp = 1;
2019        bExp = 1;
2020    }
2021    if ( bSig < aSig ) goto aBigger;
2022    if ( aSig < bSig ) goto bBigger;
2023    return packFloat64( roundData->mode == float_round_down, 0, 0 );
2024 bExpBigger:
2025    if ( bExp == 0x7FF ) {
2026        if ( bSig ) return propagateFloat64NaN( a, b );
2027        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2028    }
2029    if ( aExp == 0 ) {
2030        ++expDiff;
2031    }
2032    else {
2033        aSig |= LIT64( 0x4000000000000000 );
2034    }
2035    shift64RightJamming( aSig, - expDiff, &aSig );
2036    bSig |= LIT64( 0x4000000000000000 );
2037 bBigger:
2038    zSig = bSig - aSig;
2039    zExp = bExp;
2040    zSign ^= 1;
2041    goto normalizeRoundAndPack;
2042 aExpBigger:
2043    if ( aExp == 0x7FF ) {
2044        if ( aSig ) return propagateFloat64NaN( a, b );
2045        return a;
2046    }
2047    if ( bExp == 0 ) {
2048        --expDiff;
2049    }
2050    else {
2051        bSig |= LIT64( 0x4000000000000000 );
2052    }
2053    shift64RightJamming( bSig, expDiff, &bSig );
2054    aSig |= LIT64( 0x4000000000000000 );
2055 aBigger:
2056    zSig = aSig - bSig;
2057    zExp = aExp;
2058 normalizeRoundAndPack:
2059    --zExp;
2060    return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2061
2062}
2063
2064/*
2065-------------------------------------------------------------------------------
2066Returns the result of adding the double-precision floating-point values `a'
2067and `b'.  The operation is performed according to the IEC/IEEE Standard for
2068Binary Floating-point Arithmetic.
2069-------------------------------------------------------------------------------
2070*/
2071float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2072{
2073    flag aSign, bSign;
2074
2075    aSign = extractFloat64Sign( a );
2076    bSign = extractFloat64Sign( b );
2077    if ( aSign == bSign ) {
2078        return addFloat64Sigs( roundData, a, b, aSign );
2079    }
2080    else {
2081        return subFloat64Sigs( roundData, a, b, aSign );
2082    }
2083
2084}
2085
2086/*
2087-------------------------------------------------------------------------------
2088Returns the result of subtracting the double-precision floating-point values
2089`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2090for Binary Floating-point Arithmetic.
2091-------------------------------------------------------------------------------
2092*/
2093float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2094{
2095    flag aSign, bSign;
2096
2097    aSign = extractFloat64Sign( a );
2098    bSign = extractFloat64Sign( b );
2099    if ( aSign == bSign ) {
2100        return subFloat64Sigs( roundData, a, b, aSign );
2101    }
2102    else {
2103        return addFloat64Sigs( roundData, a, b, aSign );
2104    }
2105
2106}
2107
2108/*
2109-------------------------------------------------------------------------------
2110Returns the result of multiplying the double-precision floating-point values
2111`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2112for Binary Floating-point Arithmetic.
2113-------------------------------------------------------------------------------
2114*/
2115float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2116{
2117    flag aSign, bSign, zSign;
2118    int16 aExp, bExp, zExp;
2119    bits64 aSig, bSig, zSig0, zSig1;
2120
2121    aSig = extractFloat64Frac( a );
2122    aExp = extractFloat64Exp( a );
2123    aSign = extractFloat64Sign( a );
2124    bSig = extractFloat64Frac( b );
2125    bExp = extractFloat64Exp( b );
2126    bSign = extractFloat64Sign( b );
2127    zSign = aSign ^ bSign;
2128    if ( aExp == 0x7FF ) {
2129        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130            return propagateFloat64NaN( a, b );
2131        }
2132        if ( ( bExp | bSig ) == 0 ) {
2133            roundData->exception |= float_flag_invalid;
2134            return float64_default_nan;
2135        }
2136        return packFloat64( zSign, 0x7FF, 0 );
2137    }
2138    if ( bExp == 0x7FF ) {
2139        if ( bSig ) return propagateFloat64NaN( a, b );
2140        if ( ( aExp | aSig ) == 0 ) {
2141            roundData->exception |= float_flag_invalid;
2142            return float64_default_nan;
2143        }
2144        return packFloat64( zSign, 0x7FF, 0 );
2145    }
2146    if ( aExp == 0 ) {
2147        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2149    }
2150    if ( bExp == 0 ) {
2151        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2153    }
2154    zExp = aExp + bExp - 0x3FF;
2155    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158    zSig0 |= ( zSig1 != 0 );
2159    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2160        zSig0 <<= 1;
2161        --zExp;
2162    }
2163    return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2164
2165}
2166
2167/*
2168-------------------------------------------------------------------------------
2169Returns the result of dividing the double-precision floating-point value `a'
2170by the corresponding value `b'.  The operation is performed according to
2171the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172-------------------------------------------------------------------------------
2173*/
2174float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2175{
2176    flag aSign, bSign, zSign;
2177    int16 aExp, bExp, zExp;
2178    bits64 aSig, bSig, zSig;
2179    bits64 rem0, rem1;
2180    bits64 term0, term1;
2181
2182    aSig = extractFloat64Frac( a );
2183    aExp = extractFloat64Exp( a );
2184    aSign = extractFloat64Sign( a );
2185    bSig = extractFloat64Frac( b );
2186    bExp = extractFloat64Exp( b );
2187    bSign = extractFloat64Sign( b );
2188    zSign = aSign ^ bSign;
2189    if ( aExp == 0x7FF ) {
2190        if ( aSig ) return propagateFloat64NaN( a, b );
2191        if ( bExp == 0x7FF ) {
2192            if ( bSig ) return propagateFloat64NaN( a, b );
2193            roundData->exception |= float_flag_invalid;
2194            return float64_default_nan;
2195        }
2196        return packFloat64( zSign, 0x7FF, 0 );
2197    }
2198    if ( bExp == 0x7FF ) {
2199        if ( bSig ) return propagateFloat64NaN( a, b );
2200        return packFloat64( zSign, 0, 0 );
2201    }
2202    if ( bExp == 0 ) {
2203        if ( bSig == 0 ) {
2204            if ( ( aExp | aSig ) == 0 ) {
2205                roundData->exception |= float_flag_invalid;
2206                return float64_default_nan;
2207            }
2208            roundData->exception |= float_flag_divbyzero;
2209            return packFloat64( zSign, 0x7FF, 0 );
2210        }
2211        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2212    }
2213    if ( aExp == 0 ) {
2214        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2216    }
2217    zExp = aExp - bExp + 0x3FD;
2218    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220    if ( bSig <= ( aSig + aSig ) ) {
2221        aSig >>= 1;
2222        ++zExp;
2223    }
2224    zSig = estimateDiv128To64( aSig, 0, bSig );
2225    if ( ( zSig & 0x1FF ) <= 2 ) {
2226        mul64To128( bSig, zSig, &term0, &term1 );
2227        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228        while ( (sbits64) rem0 < 0 ) {
2229            --zSig;
2230            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2231        }
2232        zSig |= ( rem1 != 0 );
2233    }
2234    return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2235
2236}
2237
2238/*
2239-------------------------------------------------------------------------------
2240Returns the remainder of the double-precision floating-point value `a'
2241with respect to the corresponding value `b'.  The operation is performed
2242according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243-------------------------------------------------------------------------------
2244*/
2245float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2246{
2247    flag aSign, bSign, zSign;
2248    int16 aExp, bExp, expDiff;
2249    bits64 aSig, bSig;
2250    bits64 q, alternateASig;
2251    sbits64 sigMean;
2252
2253    aSig = extractFloat64Frac( a );
2254    aExp = extractFloat64Exp( a );
2255    aSign = extractFloat64Sign( a );
2256    bSig = extractFloat64Frac( b );
2257    bExp = extractFloat64Exp( b );
2258    bSign = extractFloat64Sign( b );
2259    if ( aExp == 0x7FF ) {
2260        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261            return propagateFloat64NaN( a, b );
2262        }
2263        roundData->exception |= float_flag_invalid;
2264        return float64_default_nan;
2265    }
2266    if ( bExp == 0x7FF ) {
2267        if ( bSig ) return propagateFloat64NaN( a, b );
2268        return a;
2269    }
2270    if ( bExp == 0 ) {
2271        if ( bSig == 0 ) {
2272            roundData->exception |= float_flag_invalid;
2273            return float64_default_nan;
2274        }
2275        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2276    }
2277    if ( aExp == 0 ) {
2278        if ( aSig == 0 ) return a;
2279        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2280    }
2281    expDiff = aExp - bExp;
2282    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284    if ( expDiff < 0 ) {
2285        if ( expDiff < -1 ) return a;
2286        aSig >>= 1;
2287    }
2288    q = ( bSig <= aSig );
2289    if ( q ) aSig -= bSig;
2290    expDiff -= 64;
2291    while ( 0 < expDiff ) {
2292        q = estimateDiv128To64( aSig, 0, bSig );
2293        q = ( 2 < q ) ? q - 2 : 0;
2294        aSig = - ( ( bSig>>2 ) * q );
2295        expDiff -= 62;
2296    }
2297    expDiff += 64;
2298    if ( 0 < expDiff ) {
2299        q = estimateDiv128To64( aSig, 0, bSig );
2300        q = ( 2 < q ) ? q - 2 : 0;
2301        q >>= 64 - expDiff;
2302        bSig >>= 2;
2303        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2304    }
2305    else {
2306        aSig >>= 2;
2307        bSig >>= 2;
2308    }
2309    do {
2310        alternateASig = aSig;
2311        ++q;
2312        aSig -= bSig;
2313    } while ( 0 <= (sbits64) aSig );
2314    sigMean = aSig + alternateASig;
2315    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316        aSig = alternateASig;
2317    }
2318    zSign = ( (sbits64) aSig < 0 );
2319    if ( zSign ) aSig = - aSig;
2320    return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2321
2322}
2323
2324/*
2325-------------------------------------------------------------------------------
2326Returns the square root of the double-precision floating-point value `a'.
2327The operation is performed according to the IEC/IEEE Standard for Binary
2328Floating-point Arithmetic.
2329-------------------------------------------------------------------------------
2330*/
2331float64 float64_sqrt( struct roundingData *roundData, float64 a )
2332{
2333    flag aSign;
2334    int16 aExp, zExp;
2335    bits64 aSig, zSig;
2336    bits64 rem0, rem1, term0, term1; //, shiftedRem;
2337    //float64 z;
2338
2339    aSig = extractFloat64Frac( a );
2340    aExp = extractFloat64Exp( a );
2341    aSign = extractFloat64Sign( a );
2342    if ( aExp == 0x7FF ) {
2343        if ( aSig ) return propagateFloat64NaN( a, a );
2344        if ( ! aSign ) return a;
2345        roundData->exception |= float_flag_invalid;
2346        return float64_default_nan;
2347    }
2348    if ( aSign ) {
2349        if ( ( aExp | aSig ) == 0 ) return a;
2350        roundData->exception |= float_flag_invalid;
2351        return float64_default_nan;
2352    }
2353    if ( aExp == 0 ) {
2354        if ( aSig == 0 ) return 0;
2355        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2356    }
2357    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358    aSig |= LIT64( 0x0010000000000000 );
2359    zSig = estimateSqrt32( aExp, aSig>>21 );
2360    zSig <<= 31;
2361    aSig <<= 9 - ( aExp & 1 );
2362    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363    if ( ( zSig & 0x3FF ) <= 5 ) {
2364        if ( zSig < 2 ) {
2365            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2366        }
2367        else {
2368            aSig <<= 2;
2369            mul64To128( zSig, zSig, &term0, &term1 );
2370            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371            while ( (sbits64) rem0 < 0 ) {
2372                --zSig;
2373                shortShift128Left( 0, zSig, 1, &term0, &term1 );
2374                term1 |= 1;
2375                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2376            }
2377            zSig |= ( ( rem0 | rem1 ) != 0 );
2378        }
2379    }
2380    shift64RightJamming( zSig, 1, &zSig );
2381    return roundAndPackFloat64( roundData, 0, zExp, zSig );
2382
2383}
2384
2385/*
2386-------------------------------------------------------------------------------
2387Returns 1 if the double-precision floating-point value `a' is equal to the
2388corresponding value `b', and 0 otherwise.  The comparison is performed
2389according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390-------------------------------------------------------------------------------
2391*/
2392flag float64_eq( float64 a, float64 b )
2393{
2394
2395    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2397       ) {
2398        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399            float_raise( float_flag_invalid );
2400        }
2401        return 0;
2402    }
2403    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2404
2405}
2406
2407/*
2408-------------------------------------------------------------------------------
2409Returns 1 if the double-precision floating-point value `a' is less than or
2410equal to the corresponding value `b', and 0 otherwise.  The comparison is
2411performed according to the IEC/IEEE Standard for Binary Floating-point
2412Arithmetic.
2413-------------------------------------------------------------------------------
2414*/
2415flag float64_le( float64 a, float64 b )
2416{
2417    flag aSign, bSign;
2418
2419    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2421       ) {
2422        float_raise( float_flag_invalid );
2423        return 0;
2424    }
2425    aSign = extractFloat64Sign( a );
2426    bSign = extractFloat64Sign( b );
2427    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428    return ( a == b ) || ( aSign ^ ( a < b ) );
2429
2430}
2431
2432/*
2433-------------------------------------------------------------------------------
2434Returns 1 if the double-precision floating-point value `a' is less than
2435the corresponding value `b', and 0 otherwise.  The comparison is performed
2436according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437-------------------------------------------------------------------------------
2438*/
2439flag float64_lt( float64 a, float64 b )
2440{
2441    flag aSign, bSign;
2442
2443    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2445       ) {
2446        float_raise( float_flag_invalid );
2447        return 0;
2448    }
2449    aSign = extractFloat64Sign( a );
2450    bSign = extractFloat64Sign( b );
2451    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452    return ( a != b ) && ( aSign ^ ( a < b ) );
2453
2454}
2455
2456/*
2457-------------------------------------------------------------------------------
2458Returns 1 if the double-precision floating-point value `a' is equal to the
2459corresponding value `b', and 0 otherwise.  The invalid exception is raised
2460if either operand is a NaN.  Otherwise, the comparison is performed
2461according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462-------------------------------------------------------------------------------
2463*/
2464flag float64_eq_signaling( float64 a, float64 b )
2465{
2466
2467    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2469       ) {
2470        float_raise( float_flag_invalid );
2471        return 0;
2472    }
2473    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2474
2475}
2476
2477/*
2478-------------------------------------------------------------------------------
2479Returns 1 if the double-precision floating-point value `a' is less than or
2480equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2481cause an exception.  Otherwise, the comparison is performed according to the
2482IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483-------------------------------------------------------------------------------
2484*/
2485flag float64_le_quiet( float64 a, float64 b )
2486{
2487    flag aSign, bSign;
2488    //int16 aExp, bExp;
2489
2490    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2492       ) {
2493        /* Do nothing, even if NaN as we're quiet */
2494        return 0;
2495    }
2496    aSign = extractFloat64Sign( a );
2497    bSign = extractFloat64Sign( b );
2498    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2499    return ( a == b ) || ( aSign ^ ( a < b ) );
2500
2501}
2502
2503/*
2504-------------------------------------------------------------------------------
2505Returns 1 if the double-precision floating-point value `a' is less than
2506the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2507exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2508Standard for Binary Floating-point Arithmetic.
2509-------------------------------------------------------------------------------
2510*/
2511flag float64_lt_quiet( float64 a, float64 b )
2512{
2513    flag aSign, bSign;
2514
2515    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2516         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2517       ) {
2518        /* Do nothing, even if NaN as we're quiet */
2519        return 0;
2520    }
2521    aSign = extractFloat64Sign( a );
2522    bSign = extractFloat64Sign( b );
2523    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2524    return ( a != b ) && ( aSign ^ ( a < b ) );
2525
2526}
2527
2528#ifdef FLOATX80
2529
2530/*
2531-------------------------------------------------------------------------------
2532Returns the result of converting the extended double-precision floating-
2533point value `a' to the 32-bit two's complement integer format.  The
2534conversion is performed according to the IEC/IEEE Standard for Binary
2535Floating-point Arithmetic---which means in particular that the conversion
2536is rounded according to the current rounding mode.  If `a' is a NaN, the
2537largest positive integer is returned.  Otherwise, if the conversion
2538overflows, the largest integer with the same sign as `a' is returned.
2539-------------------------------------------------------------------------------
2540*/
2541int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2542{
2543    flag aSign;
2544    int32 aExp, shiftCount;
2545    bits64 aSig;
2546
2547    aSig = extractFloatx80Frac( a );
2548    aExp = extractFloatx80Exp( a );
2549    aSign = extractFloatx80Sign( a );
2550    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2551    shiftCount = 0x4037 - aExp;
2552    if ( shiftCount <= 0 ) shiftCount = 1;
2553    shift64RightJamming( aSig, shiftCount, &aSig );
2554    return roundAndPackInt32( roundData, aSign, aSig );
2555
2556}
2557
2558/*
2559-------------------------------------------------------------------------------
2560Returns the result of converting the extended double-precision floating-
2561point value `a' to the 32-bit two's complement integer format.  The
2562conversion is performed according to the IEC/IEEE Standard for Binary
2563Floating-point Arithmetic, except that the conversion is always rounded
2564toward zero.  If `a' is a NaN, the largest positive integer is returned.
2565Otherwise, if the conversion overflows, the largest integer with the same
2566sign as `a' is returned.
2567-------------------------------------------------------------------------------
2568*/
2569int32 floatx80_to_int32_round_to_zero( floatx80 a )
2570{
2571    flag aSign;
2572    int32 aExp, shiftCount;
2573    bits64 aSig, savedASig;
2574    int32 z;
2575
2576    aSig = extractFloatx80Frac( a );
2577    aExp = extractFloatx80Exp( a );
2578    aSign = extractFloatx80Sign( a );
2579    shiftCount = 0x403E - aExp;
2580    if ( shiftCount < 32 ) {
2581        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2582        goto invalid;
2583    }
2584    else if ( 63 < shiftCount ) {
2585        if ( aExp || aSig ) float_raise( float_flag_inexact );
2586        return 0;
2587    }
2588    savedASig = aSig;
2589    aSig >>= shiftCount;
2590    z = aSig;
2591    if ( aSign ) z = - z;
2592    if ( ( z < 0 ) ^ aSign ) {
2593 invalid:
2594        float_raise( float_flag_invalid );
2595        return aSign ? 0x80000000 : 0x7FFFFFFF;
2596    }
2597    if ( ( aSig<<shiftCount ) != savedASig ) {
2598        float_raise( float_flag_inexact );
2599    }
2600    return z;
2601
2602}
2603
2604/*
2605-------------------------------------------------------------------------------
2606Returns the result of converting the extended double-precision floating-
2607point value `a' to the single-precision floating-point format.  The
2608conversion is performed according to the IEC/IEEE Standard for Binary
2609Floating-point Arithmetic.
2610-------------------------------------------------------------------------------
2611*/
2612float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2613{
2614    flag aSign;
2615    int32 aExp;
2616    bits64 aSig;
2617
2618    aSig = extractFloatx80Frac( a );
2619    aExp = extractFloatx80Exp( a );
2620    aSign = extractFloatx80Sign( a );
2621    if ( aExp == 0x7FFF ) {
2622        if ( (bits64) ( aSig<<1 ) ) {
2623            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2624        }
2625        return packFloat32( aSign, 0xFF, 0 );
2626    }
2627    shift64RightJamming( aSig, 33, &aSig );
2628    if ( aExp || aSig ) aExp -= 0x3F81;
2629    return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2630
2631}
2632
2633/*
2634-------------------------------------------------------------------------------
2635Returns the result of converting the extended double-precision floating-
2636point value `a' to the double-precision floating-point format.  The
2637conversion is performed according to the IEC/IEEE Standard for Binary
2638Floating-point Arithmetic.
2639-------------------------------------------------------------------------------
2640*/
2641float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2642{
2643    flag aSign;
2644    int32 aExp;
2645    bits64 aSig, zSig;
2646
2647    aSig = extractFloatx80Frac( a );
2648    aExp = extractFloatx80Exp( a );
2649    aSign = extractFloatx80Sign( a );
2650    if ( aExp == 0x7FFF ) {
2651        if ( (bits64) ( aSig<<1 ) ) {
2652            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2653        }
2654        return packFloat64( aSign, 0x7FF, 0 );
2655    }
2656    shift64RightJamming( aSig, 1, &zSig );
2657    if ( aExp || aSig ) aExp -= 0x3C01;
2658    return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2659
2660}
2661
2662/*
2663-------------------------------------------------------------------------------
2664Rounds the extended double-precision floating-point value `a' to an integer,
2665and returns the result as an extended quadruple-precision floating-point
2666value.  The operation is performed according to the IEC/IEEE Standard for
2667Binary Floating-point Arithmetic.
2668-------------------------------------------------------------------------------
2669*/
2670floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2671{
2672    flag aSign;
2673    int32 aExp;
2674    bits64 lastBitMask, roundBitsMask;
2675    int8 roundingMode;
2676    floatx80 z;
2677
2678    aExp = extractFloatx80Exp( a );
2679    if ( 0x403E <= aExp ) {
2680        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2681            return propagateFloatx80NaN( a, a );
2682        }
2683        return a;
2684    }
2685    if ( aExp <= 0x3FFE ) {
2686        if (    ( aExp == 0 )
2687             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2688            return a;
2689        }
2690        roundData->exception |= float_flag_inexact;
2691        aSign = extractFloatx80Sign( a );
2692        switch ( roundData->mode ) {
2693         case float_round_nearest_even:
2694            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2695               ) {
2696                return
2697                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2698            }
2699            break;
2700         case float_round_down:
2701            return
2702                  aSign ?
2703                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2704                : packFloatx80( 0, 0, 0 );
2705         case float_round_up:
2706            return
2707                  aSign ? packFloatx80( 1, 0, 0 )
2708                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2709        }
2710        return packFloatx80( aSign, 0, 0 );
2711    }
2712    lastBitMask = 1;
2713    lastBitMask <<= 0x403E - aExp;
2714    roundBitsMask = lastBitMask - 1;
2715    z = a;
2716    roundingMode = roundData->mode;
2717    if ( roundingMode == float_round_nearest_even ) {
2718        z.low += lastBitMask>>1;
2719        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2720    }
2721    else if ( roundingMode != float_round_to_zero ) {
2722        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2723            z.low += roundBitsMask;
2724        }
2725    }
2726    z.low &= ~ roundBitsMask;
2727    if ( z.low == 0 ) {
2728        ++z.high;
2729        z.low = LIT64( 0x8000000000000000 );
2730    }
2731    if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2732    return z;
2733
2734}
2735
2736/*
2737-------------------------------------------------------------------------------
2738Returns the result of adding the absolute values of the extended double-
2739precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2740negated before being returned.  `zSign' is ignored if the result is a NaN.
2741The addition is performed according to the IEC/IEEE Standard for Binary
2742Floating-point Arithmetic.
2743-------------------------------------------------------------------------------
2744*/
2745static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2746{
2747    int32 aExp, bExp, zExp;
2748    bits64 aSig, bSig, zSig0, zSig1;
2749    int32 expDiff;
2750
2751    aSig = extractFloatx80Frac( a );
2752    aExp = extractFloatx80Exp( a );
2753    bSig = extractFloatx80Frac( b );
2754    bExp = extractFloatx80Exp( b );
2755    expDiff = aExp - bExp;
2756    if ( 0 < expDiff ) {
2757        if ( aExp == 0x7FFF ) {
2758            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2759            return a;
2760        }
2761        if ( bExp == 0 ) --expDiff;
2762        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2763        zExp = aExp;
2764    }
2765    else if ( expDiff < 0 ) {
2766        if ( bExp == 0x7FFF ) {
2767            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2768            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2769        }
2770        if ( aExp == 0 ) ++expDiff;
2771        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2772        zExp = bExp;
2773    }
2774    else {
2775        if ( aExp == 0x7FFF ) {
2776            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2777                return propagateFloatx80NaN( a, b );
2778            }
2779            return a;
2780        }
2781        zSig1 = 0;
2782        zSig0 = aSig + bSig;
2783        if ( aExp == 0 ) {
2784            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2785            goto roundAndPack;
2786        }
2787        zExp = aExp;
2788        goto shiftRight1;
2789    }
2790    
2791    zSig0 = aSig + bSig;
2792
2793    if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 
2794 shiftRight1:
2795    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2796    zSig0 |= LIT64( 0x8000000000000000 );
2797    ++zExp;
2798 roundAndPack:
2799    return
2800        roundAndPackFloatx80(
2801            roundData, zSign, zExp, zSig0, zSig1 );
2802
2803}
2804
2805/*
2806-------------------------------------------------------------------------------
2807Returns the result of subtracting the absolute values of the extended
2808double-precision floating-point values `a' and `b'.  If `zSign' is true,
2809the difference is negated before being returned.  `zSign' is ignored if the
2810result is a NaN.  The subtraction is performed according to the IEC/IEEE
2811Standard for Binary Floating-point Arithmetic.
2812-------------------------------------------------------------------------------
2813*/
2814static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2815{
2816    int32 aExp, bExp, zExp;
2817    bits64 aSig, bSig, zSig0, zSig1;
2818    int32 expDiff;
2819    floatx80 z;
2820
2821    aSig = extractFloatx80Frac( a );
2822    aExp = extractFloatx80Exp( a );
2823    bSig = extractFloatx80Frac( b );
2824    bExp = extractFloatx80Exp( b );
2825    expDiff = aExp - bExp;
2826    if ( 0 < expDiff ) goto aExpBigger;
2827    if ( expDiff < 0 ) goto bExpBigger;
2828    if ( aExp == 0x7FFF ) {
2829        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2830            return propagateFloatx80NaN( a, b );
2831        }
2832        roundData->exception |= float_flag_invalid;
2833        z.low = floatx80_default_nan_low;
2834        z.high = floatx80_default_nan_high;
2835        z.__padding = 0;
2836        return z;
2837    }
2838    if ( aExp == 0 ) {
2839        aExp = 1;
2840        bExp = 1;
2841    }
2842    zSig1 = 0;
2843    if ( bSig < aSig ) goto aBigger;
2844    if ( aSig < bSig ) goto bBigger;
2845    return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2846 bExpBigger:
2847    if ( bExp == 0x7FFF ) {
2848        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2849        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2850    }
2851    if ( aExp == 0 ) ++expDiff;
2852    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2853 bBigger:
2854    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2855    zExp = bExp;
2856    zSign ^= 1;
2857    goto normalizeRoundAndPack;
2858 aExpBigger:
2859    if ( aExp == 0x7FFF ) {
2860        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2861        return a;
2862    }
2863    if ( bExp == 0 ) --expDiff;
2864    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2865 aBigger:
2866    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2867    zExp = aExp;
2868 normalizeRoundAndPack:
2869    return
2870        normalizeRoundAndPackFloatx80(
2871            roundData, zSign, zExp, zSig0, zSig1 );
2872
2873}
2874
2875/*
2876-------------------------------------------------------------------------------
2877Returns the result of adding the extended double-precision floating-point
2878values `a' and `b'.  The operation is performed according to the IEC/IEEE
2879Standard for Binary Floating-point Arithmetic.
2880-------------------------------------------------------------------------------
2881*/
2882floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2883{
2884    flag aSign, bSign;
2885    
2886    aSign = extractFloatx80Sign( a );
2887    bSign = extractFloatx80Sign( b );
2888    if ( aSign == bSign ) {
2889        return addFloatx80Sigs( roundData, a, b, aSign );
2890    }
2891    else {
2892        return subFloatx80Sigs( roundData, a, b, aSign );
2893    }
2894    
2895}
2896
2897/*
2898-------------------------------------------------------------------------------
2899Returns the result of subtracting the extended double-precision floating-
2900point values `a' and `b'.  The operation is performed according to the
2901IEC/IEEE Standard for Binary Floating-point Arithmetic.
2902-------------------------------------------------------------------------------
2903*/
2904floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2905{
2906    flag aSign, bSign;
2907
2908    aSign = extractFloatx80Sign( a );
2909    bSign = extractFloatx80Sign( b );
2910    if ( aSign == bSign ) {
2911        return subFloatx80Sigs( roundData, a, b, aSign );
2912    }
2913    else {
2914        return addFloatx80Sigs( roundData, a, b, aSign );
2915    }
2916
2917}
2918
2919/*
2920-------------------------------------------------------------------------------
2921Returns the result of multiplying the extended double-precision floating-
2922point values `a' and `b'.  The operation is performed according to the
2923IEC/IEEE Standard for Binary Floating-point Arithmetic.
2924-------------------------------------------------------------------------------
2925*/
2926floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2927{
2928    flag aSign, bSign, zSign;
2929    int32 aExp, bExp, zExp;
2930    bits64 aSig, bSig, zSig0, zSig1;
2931    floatx80 z;
2932
2933    aSig = extractFloatx80Frac( a );
2934    aExp = extractFloatx80Exp( a );
2935    aSign = extractFloatx80Sign( a );
2936    bSig = extractFloatx80Frac( b );
2937    bExp = extractFloatx80Exp( b );
2938    bSign = extractFloatx80Sign( b );
2939    zSign = aSign ^ bSign;
2940    if ( aExp == 0x7FFF ) {
2941        if (    (bits64) ( aSig<<1 )
2942             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2943            return propagateFloatx80NaN( a, b );
2944        }
2945        if ( ( bExp | bSig ) == 0 ) goto invalid;
2946        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2947    }
2948    if ( bExp == 0x7FFF ) {
2949        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2950        if ( ( aExp | aSig ) == 0 ) {
2951 invalid:
2952            roundData->exception |= float_flag_invalid;
2953            z.low = floatx80_default_nan_low;
2954            z.high = floatx80_default_nan_high;
2955            z.__padding = 0;
2956            return z;
2957        }
2958        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2959    }
2960    if ( aExp == 0 ) {
2961        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2962        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2963    }
2964    if ( bExp == 0 ) {
2965        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2966        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2967    }
2968    zExp = aExp + bExp - 0x3FFE;
2969    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2970    if ( 0 < (sbits64) zSig0 ) {
2971        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2972        --zExp;
2973    }
2974    return
2975        roundAndPackFloatx80(
2976            roundData, zSign, zExp, zSig0, zSig1 );
2977
2978}
2979
2980/*
2981-------------------------------------------------------------------------------
2982Returns the result of dividing the extended double-precision floating-point
2983value `a' by the corresponding value `b'.  The operation is performed
2984according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2985-------------------------------------------------------------------------------
2986*/
2987floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2988{
2989    flag aSign, bSign, zSign;
2990    int32 aExp, bExp, zExp;
2991    bits64 aSig, bSig, zSig0, zSig1;
2992    bits64 rem0, rem1, rem2, term0, term1, term2;
2993    floatx80 z;
2994
2995    aSig = extractFloatx80Frac( a );
2996    aExp = extractFloatx80Exp( a );
2997    aSign = extractFloatx80Sign( a );
2998    bSig = extractFloatx80Frac( b );
2999    bExp = extractFloatx80Exp( b );
3000    bSign = extractFloatx80Sign( b );
3001    zSign = aSign ^ bSign;
3002    if ( aExp == 0x7FFF ) {
3003        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3004        if ( bExp == 0x7FFF ) {
3005            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3006            goto invalid;
3007        }
3008        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3009    }
3010    if ( bExp == 0x7FFF ) {
3011        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3012        return packFloatx80( zSign, 0, 0 );
3013    }
3014    if ( bExp == 0 ) {
3015        if ( bSig == 0 ) {
3016            if ( ( aExp | aSig ) == 0 ) {
3017 invalid:
3018                roundData->exception |= float_flag_invalid;
3019                z.low = floatx80_default_nan_low;
3020                z.high = floatx80_default_nan_high;
3021                z.__padding = 0;
3022                return z;
3023            }
3024            roundData->exception |= float_flag_divbyzero;
3025            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3026        }
3027        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3028    }
3029    if ( aExp == 0 ) {
3030        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3031        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3032    }
3033    zExp = aExp - bExp + 0x3FFE;
3034    rem1 = 0;
3035    if ( bSig <= aSig ) {
3036        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3037        ++zExp;
3038    }
3039    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3040    mul64To128( bSig, zSig0, &term0, &term1 );
3041    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3042    while ( (sbits64) rem0 < 0 ) {
3043        --zSig0;
3044        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3045    }
3046    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3047    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3048        mul64To128( bSig, zSig1, &term1, &term2 );
3049        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3050        while ( (sbits64) rem1 < 0 ) {
3051            --zSig1;
3052            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3053        }
3054        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3055    }
3056    return
3057        roundAndPackFloatx80(
3058            roundData, zSign, zExp, zSig0, zSig1 );
3059
3060}
3061
3062/*
3063-------------------------------------------------------------------------------
3064Returns the remainder of the extended double-precision floating-point value
3065`a' with respect to the corresponding value `b'.  The operation is performed
3066according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3067-------------------------------------------------------------------------------
3068*/
3069floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3070{
3071    flag aSign, bSign, zSign;
3072    int32 aExp, bExp, expDiff;
3073    bits64 aSig0, aSig1, bSig;
3074    bits64 q, term0, term1, alternateASig0, alternateASig1;
3075    floatx80 z;
3076
3077    aSig0 = extractFloatx80Frac( a );
3078    aExp = extractFloatx80Exp( a );
3079    aSign = extractFloatx80Sign( a );
3080    bSig = extractFloatx80Frac( b );
3081    bExp = extractFloatx80Exp( b );
3082    bSign = extractFloatx80Sign( b );
3083    if ( aExp == 0x7FFF ) {
3084        if (    (bits64) ( aSig0<<1 )
3085             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3086            return propagateFloatx80NaN( a, b );
3087        }
3088        goto invalid;
3089    }
3090    if ( bExp == 0x7FFF ) {
3091        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3092        return a;
3093    }
3094    if ( bExp == 0 ) {
3095        if ( bSig == 0 ) {
3096 invalid:
3097            roundData->exception |= float_flag_invalid;
3098            z.low = floatx80_default_nan_low;
3099            z.high = floatx80_default_nan_high;
3100            z.__padding = 0;
3101            return z;
3102        }
3103        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3104    }
3105    if ( aExp == 0 ) {
3106        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3108    }
3109    bSig |= LIT64( 0x8000000000000000 );
3110    zSign = aSign;
3111    expDiff = aExp - bExp;
3112    aSig1 = 0;
3113    if ( expDiff < 0 ) {
3114        if ( expDiff < -1 ) return a;
3115        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3116        expDiff = 0;
3117    }
3118    q = ( bSig <= aSig0 );
3119    if ( q ) aSig0 -= bSig;
3120    expDiff -= 64;
3121    while ( 0 < expDiff ) {
3122        q = estimateDiv128To64( aSig0, aSig1, bSig );
3123        q = ( 2 < q ) ? q - 2 : 0;
3124        mul64To128( bSig, q, &term0, &term1 );
3125        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3127        expDiff -= 62;
3128    }
3129    expDiff += 64;
3130    if ( 0 < expDiff ) {
3131        q = estimateDiv128To64( aSig0, aSig1, bSig );
3132        q = ( 2 < q ) ? q - 2 : 0;
3133        q >>= 64 - expDiff;
3134        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3138            ++q;
3139            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3140        }
3141    }
3142    else {
3143        term1 = 0;
3144        term0 = bSig;
3145    }
3146    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3149              && ( q & 1 ) )
3150       ) {
3151        aSig0 = alternateASig0;
3152        aSig1 = alternateASig1;
3153        zSign = ! zSign;
3154    }
3155
3156    return
3157        normalizeRoundAndPackFloatx80(
3158            roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3159
3160}
3161
3162/*
3163-------------------------------------------------------------------------------
3164Returns the square root of the extended double-precision floating-point
3165value `a'.  The operation is performed according to the IEC/IEEE Standard
3166for Binary Floating-point Arithmetic.
3167-------------------------------------------------------------------------------
3168*/
3169floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3170{
3171    flag aSign;
3172    int32 aExp, zExp;
3173    bits64 aSig0, aSig1, zSig0, zSig1;
3174    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3175    bits64 shiftedRem0, shiftedRem1;
3176    floatx80 z;
3177
3178    aSig0 = extractFloatx80Frac( a );
3179    aExp = extractFloatx80Exp( a );
3180    aSign = extractFloatx80Sign( a );
3181    if ( aExp == 0x7FFF ) {
3182        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3183        if ( ! aSign ) return a;
3184        goto invalid;
3185    }
3186    if ( aSign ) {
3187        if ( ( aExp | aSig0 ) == 0 ) return a;
3188 invalid:
3189        roundData->exception |= float_flag_invalid;
3190        z.low = floatx80_default_nan_low;
3191        z.high = floatx80_default_nan_high;
3192        z.__padding = 0;
3193        return z;
3194    }
3195    if ( aExp == 0 ) {
3196        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3197        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3198    }
3199    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3200    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3201    zSig0 <<= 31;
3202    aSig1 = 0;
3203    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3204    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3205    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3206    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3207    mul64To128( zSig0, zSig0, &term0, &term1 );
3208    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3209    while ( (sbits64) rem0 < 0 ) {
3210        --zSig0;
3211        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3212        term1 |= 1;
3213        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3214    }
3215    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3216    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3217    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3218        if ( zSig1 == 0 ) zSig1 = 1;
3219        mul64To128( zSig0, zSig1, &term1, &term2 );
3220        shortShift128Left( term1, term2, 1, &term1, &term2 );
3221        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3222        mul64To128( zSig1, zSig1, &term2, &term3 );
3223        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3224        while ( (sbits64) rem1 < 0 ) {
3225            --zSig1;
3226            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3227            term3 |= 1;
3228            add192(
3229                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3230        }
3231        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3232    }
3233    return
3234        roundAndPackFloatx80(
3235            roundData, 0, zExp, zSig0, zSig1 );
3236
3237}
3238
3239/*
3240-------------------------------------------------------------------------------
3241Returns 1 if the extended double-precision floating-point value `a' is
3242equal to the corresponding value `b', and 0 otherwise.  The comparison is
3243performed according to the IEC/IEEE Standard for Binary Floating-point
3244Arithmetic.
3245-------------------------------------------------------------------------------
3246*/
3247flag floatx80_eq( floatx80 a, floatx80 b )
3248{
3249
3250    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3251              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3252         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3253              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3254       ) {
3255        if (    floatx80_is_signaling_nan( a )
3256             || floatx80_is_signaling_nan( b ) ) {
3257            float_raise( float_flag_invalid );
3258        }
3259        return 0;
3260    }
3261    return
3262           ( a.low == b.low )
3263        && (    ( a.high == b.high )
3264             || (    ( a.low == 0 )
3265                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3266           );
3267
3268}
3269
3270/*
3271-------------------------------------------------------------------------------
3272Returns 1 if the extended double-precision floating-point value `a' is
3273less than or equal to the corresponding value `b', and 0 otherwise.  The
3274comparison is performed according to the IEC/IEEE Standard for Binary
3275Floating-point Arithmetic.
3276-------------------------------------------------------------------------------
3277*/
3278flag floatx80_le( floatx80 a, floatx80 b )
3279{
3280    flag aSign, bSign;
3281
3282    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3283              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3284         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3285              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3286       ) {
3287        float_raise( float_flag_invalid );
3288        return 0;
3289    }
3290    aSign = extractFloatx80Sign( a );
3291    bSign = extractFloatx80Sign( b );
3292    if ( aSign != bSign ) {
3293        return
3294               aSign
3295            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3296                 == 0 );
3297    }
3298    return
3299          aSign ? le128( b.high, b.low, a.high, a.low )
3300        : le128( a.high, a.low, b.high, b.low );
3301
3302}
3303
3304/*
3305-------------------------------------------------------------------------------
3306Returns 1 if the extended double-precision floating-point value `a' is
3307less than the corresponding value `b', and 0 otherwise.  The comparison
3308is performed according to the IEC/IEEE Standard for Binary Floating-point
3309Arithmetic.
3310-------------------------------------------------------------------------------
3311*/
3312flag floatx80_lt( floatx80 a, floatx80 b )
3313{
3314    flag aSign, bSign;
3315
3316    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3317              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3318         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3319              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3320       ) {
3321        float_raise( float_flag_invalid );
3322        return 0;
3323    }
3324    aSign = extractFloatx80Sign( a );
3325    bSign = extractFloatx80Sign( b );
3326    if ( aSign != bSign ) {
3327        return
3328               aSign
3329            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3330                 != 0 );
3331    }
3332    return
3333          aSign ? lt128( b.high, b.low, a.high, a.low )
3334        : lt128( a.high, a.low, b.high, b.low );
3335
3336}
3337
3338/*
3339-------------------------------------------------------------------------------
3340Returns 1 if the extended double-precision floating-point value `a' is equal
3341to the corresponding value `b', and 0 otherwise.  The invalid exception is
3342raised if either operand is a NaN.  Otherwise, the comparison is performed
3343according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344-------------------------------------------------------------------------------
3345*/
3346flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3347{
3348
3349    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3350              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3351         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3352              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3353       ) {
3354        float_raise( float_flag_invalid );
3355        return 0;
3356    }
3357    return
3358           ( a.low == b.low )
3359        && (    ( a.high == b.high )
3360             || (    ( a.low == 0 )
3361                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3362           );
3363
3364}
3365
3366/*
3367-------------------------------------------------------------------------------
3368Returns 1 if the extended double-precision floating-point value `a' is less
3369than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3370do not cause an exception.  Otherwise, the comparison is performed according
3371to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3372-------------------------------------------------------------------------------
3373*/
3374flag floatx80_le_quiet( floatx80 a, floatx80 b )
3375{
3376    flag aSign, bSign;
3377
3378    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3379              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3380         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3381              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3382       ) {
3383        /* Do nothing, even if NaN as we're quiet */
3384        return 0;
3385    }
3386    aSign = extractFloatx80Sign( a );
3387    bSign = extractFloatx80Sign( b );
3388    if ( aSign != bSign ) {
3389        return
3390               aSign
3391            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3392                 == 0 );
3393    }
3394    return
3395          aSign ? le128( b.high, b.low, a.high, a.low )
3396        : le128( a.high, a.low, b.high, b.low );
3397
3398}
3399
3400/*
3401-------------------------------------------------------------------------------
3402Returns 1 if the extended double-precision floating-point value `a' is less
3403than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3404an exception.  Otherwise, the comparison is performed according to the
3405IEC/IEEE Standard for Binary Floating-point Arithmetic.
3406-------------------------------------------------------------------------------
3407*/
3408flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3409{
3410    flag aSign, bSign;
3411
3412    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3413              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3414         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3415              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3416       ) {
3417        /* Do nothing, even if NaN as we're quiet */
3418        return 0;
3419    }
3420    aSign = extractFloatx80Sign( a );
3421    bSign = extractFloatx80Sign( b );
3422    if ( aSign != bSign ) {
3423        return
3424               aSign
3425            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3426                 != 0 );
3427    }
3428    return
3429          aSign ? lt128( b.high, b.low, a.high, a.low )
3430        : lt128( a.high, a.low, b.high, b.low );
3431
3432}
3433
3434#endif
3435
3436