qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90
  91/*----------------------------------------------------------------------------
  92| Primitive arithmetic functions, including multi-word arithmetic, and
  93| division and square root approximations.  (Can be specialized to target if
  94| desired.)
  95*----------------------------------------------------------------------------*/
  96#include "softfloat-macros.h"
  97
  98/*----------------------------------------------------------------------------
  99| Functions and definitions to determine:  (1) whether tininess for underflow
 100| is detected before or after rounding by default, (2) what (if anything)
 101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103| are propagated from function inputs to output.  These details are target-
 104| specific.
 105*----------------------------------------------------------------------------*/
 106#include "softfloat-specialize.h"
 107
 108/*----------------------------------------------------------------------------
 109| Returns the fraction bits of the half-precision floating-point value `a'.
 110*----------------------------------------------------------------------------*/
 111
 112static inline uint32_t extractFloat16Frac(float16 a)
 113{
 114    return float16_val(a) & 0x3ff;
 115}
 116
 117/*----------------------------------------------------------------------------
 118| Returns the exponent bits of the half-precision floating-point value `a'.
 119*----------------------------------------------------------------------------*/
 120
 121static inline int extractFloat16Exp(float16 a)
 122{
 123    return (float16_val(a) >> 10) & 0x1f;
 124}
 125
 126/*----------------------------------------------------------------------------
 127| Returns the sign bit of the single-precision floating-point value `a'.
 128*----------------------------------------------------------------------------*/
 129
 130static inline flag extractFloat16Sign(float16 a)
 131{
 132    return float16_val(a)>>15;
 133}
 134
 135/*----------------------------------------------------------------------------
 136| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 137| and 7, and returns the properly rounded 32-bit integer corresponding to the
 138| input.  If `zSign' is 1, the input is negated before being converted to an
 139| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 140| is simply rounded to an integer, with the inexact exception raised if the
 141| input cannot be represented exactly as an integer.  However, if the fixed-
 142| point input is too large, the invalid exception is raised and the largest
 143| positive or negative integer is returned.
 144*----------------------------------------------------------------------------*/
 145
 146static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 147{
 148    int8_t roundingMode;
 149    flag roundNearestEven;
 150    int8_t roundIncrement, roundBits;
 151    int32_t z;
 152
 153    roundingMode = status->float_rounding_mode;
 154    roundNearestEven = ( roundingMode == float_round_nearest_even );
 155    switch (roundingMode) {
 156    case float_round_nearest_even:
 157    case float_round_ties_away:
 158        roundIncrement = 0x40;
 159        break;
 160    case float_round_to_zero:
 161        roundIncrement = 0;
 162        break;
 163    case float_round_up:
 164        roundIncrement = zSign ? 0 : 0x7f;
 165        break;
 166    case float_round_down:
 167        roundIncrement = zSign ? 0x7f : 0;
 168        break;
 169    default:
 170        abort();
 171    }
 172    roundBits = absZ & 0x7F;
 173    absZ = ( absZ + roundIncrement )>>7;
 174    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 175    z = absZ;
 176    if ( zSign ) z = - z;
 177    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 178        float_raise(float_flag_invalid, status);
 179        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 180    }
 181    if (roundBits) {
 182        status->float_exception_flags |= float_flag_inexact;
 183    }
 184    return z;
 185
 186}
 187
 188/*----------------------------------------------------------------------------
 189| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 190| `absZ1', with binary point between bits 63 and 64 (between the input words),
 191| and returns the properly rounded 64-bit integer corresponding to the input.
 192| If `zSign' is 1, the input is negated before being converted to an integer.
 193| Ordinarily, the fixed-point input is simply rounded to an integer, with
 194| the inexact exception raised if the input cannot be represented exactly as
 195| an integer.  However, if the fixed-point input is too large, the invalid
 196| exception is raised and the largest positive or negative integer is
 197| returned.
 198*----------------------------------------------------------------------------*/
 199
 200static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 201                               float_status *status)
 202{
 203    int8_t roundingMode;
 204    flag roundNearestEven, increment;
 205    int64_t z;
 206
 207    roundingMode = status->float_rounding_mode;
 208    roundNearestEven = ( roundingMode == float_round_nearest_even );
 209    switch (roundingMode) {
 210    case float_round_nearest_even:
 211    case float_round_ties_away:
 212        increment = ((int64_t) absZ1 < 0);
 213        break;
 214    case float_round_to_zero:
 215        increment = 0;
 216        break;
 217    case float_round_up:
 218        increment = !zSign && absZ1;
 219        break;
 220    case float_round_down:
 221        increment = zSign && absZ1;
 222        break;
 223    default:
 224        abort();
 225    }
 226    if ( increment ) {
 227        ++absZ0;
 228        if ( absZ0 == 0 ) goto overflow;
 229        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 230    }
 231    z = absZ0;
 232    if ( zSign ) z = - z;
 233    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 234 overflow:
 235        float_raise(float_flag_invalid, status);
 236        return
 237              zSign ? (int64_t) LIT64( 0x8000000000000000 )
 238            : LIT64( 0x7FFFFFFFFFFFFFFF );
 239    }
 240    if (absZ1) {
 241        status->float_exception_flags |= float_flag_inexact;
 242    }
 243    return z;
 244
 245}
 246
 247/*----------------------------------------------------------------------------
 248| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 249| `absZ1', with binary point between bits 63 and 64 (between the input words),
 250| and returns the properly rounded 64-bit unsigned integer corresponding to the
 251| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 252| with the inexact exception raised if the input cannot be represented exactly
 253| as an integer.  However, if the fixed-point input is too large, the invalid
 254| exception is raised and the largest unsigned integer is returned.
 255*----------------------------------------------------------------------------*/
 256
 257static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 258                                uint64_t absZ1, float_status *status)
 259{
 260    int8_t roundingMode;
 261    flag roundNearestEven, increment;
 262
 263    roundingMode = status->float_rounding_mode;
 264    roundNearestEven = (roundingMode == float_round_nearest_even);
 265    switch (roundingMode) {
 266    case float_round_nearest_even:
 267    case float_round_ties_away:
 268        increment = ((int64_t)absZ1 < 0);
 269        break;
 270    case float_round_to_zero:
 271        increment = 0;
 272        break;
 273    case float_round_up:
 274        increment = !zSign && absZ1;
 275        break;
 276    case float_round_down:
 277        increment = zSign && absZ1;
 278        break;
 279    default:
 280        abort();
 281    }
 282    if (increment) {
 283        ++absZ0;
 284        if (absZ0 == 0) {
 285            float_raise(float_flag_invalid, status);
 286            return LIT64(0xFFFFFFFFFFFFFFFF);
 287        }
 288        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 289    }
 290
 291    if (zSign && absZ0) {
 292        float_raise(float_flag_invalid, status);
 293        return 0;
 294    }
 295
 296    if (absZ1) {
 297        status->float_exception_flags |= float_flag_inexact;
 298    }
 299    return absZ0;
 300}
 301
 302/*----------------------------------------------------------------------------
 303| Returns the fraction bits of the single-precision floating-point value `a'.
 304*----------------------------------------------------------------------------*/
 305
 306static inline uint32_t extractFloat32Frac( float32 a )
 307{
 308
 309    return float32_val(a) & 0x007FFFFF;
 310
 311}
 312
 313/*----------------------------------------------------------------------------
 314| Returns the exponent bits of the single-precision floating-point value `a'.
 315*----------------------------------------------------------------------------*/
 316
 317static inline int extractFloat32Exp(float32 a)
 318{
 319
 320    return ( float32_val(a)>>23 ) & 0xFF;
 321
 322}
 323
 324/*----------------------------------------------------------------------------
 325| Returns the sign bit of the single-precision floating-point value `a'.
 326*----------------------------------------------------------------------------*/
 327
 328static inline flag extractFloat32Sign( float32 a )
 329{
 330
 331    return float32_val(a)>>31;
 332
 333}
 334
 335/*----------------------------------------------------------------------------
 336| If `a' is denormal and we are in flush-to-zero mode then set the
 337| input-denormal exception and return zero. Otherwise just return the value.
 338*----------------------------------------------------------------------------*/
 339float32 float32_squash_input_denormal(float32 a, float_status *status)
 340{
 341    if (status->flush_inputs_to_zero) {
 342        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 343            float_raise(float_flag_input_denormal, status);
 344            return make_float32(float32_val(a) & 0x80000000);
 345        }
 346    }
 347    return a;
 348}
 349
 350/*----------------------------------------------------------------------------
 351| Normalizes the subnormal single-precision floating-point value represented
 352| by the denormalized significand `aSig'.  The normalized exponent and
 353| significand are stored at the locations pointed to by `zExpPtr' and
 354| `zSigPtr', respectively.
 355*----------------------------------------------------------------------------*/
 356
 357static void
 358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
 359{
 360    int8_t shiftCount;
 361
 362    shiftCount = countLeadingZeros32( aSig ) - 8;
 363    *zSigPtr = aSig<<shiftCount;
 364    *zExpPtr = 1 - shiftCount;
 365
 366}
 367
 368/*----------------------------------------------------------------------------
 369| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 370| single-precision floating-point value, returning the result.  After being
 371| shifted into the proper positions, the three fields are simply added
 372| together to form the result.  This means that any integer portion of `zSig'
 373| will be added into the exponent.  Since a properly normalized significand
 374| will have an integer portion equal to 1, the `zExp' input should be 1 less
 375| than the desired result exponent whenever `zSig' is a complete, normalized
 376| significand.
 377*----------------------------------------------------------------------------*/
 378
 379static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
 380{
 381
 382    return make_float32(
 383          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 384
 385}
 386
 387/*----------------------------------------------------------------------------
 388| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 389| and significand `zSig', and returns the proper single-precision floating-
 390| point value corresponding to the abstract input.  Ordinarily, the abstract
 391| value is simply rounded and packed into the single-precision format, with
 392| the inexact exception raised if the abstract input cannot be represented
 393| exactly.  However, if the abstract value is too large, the overflow and
 394| inexact exceptions are raised and an infinity or maximal finite value is
 395| returned.  If the abstract value is too small, the input value is rounded to
 396| a subnormal number, and the underflow and inexact exceptions are raised if
 397| the abstract input cannot be represented exactly as a subnormal single-
 398| precision floating-point number.
 399|     The input significand `zSig' has its binary point between bits 30
 400| and 29, which is 7 bits to the left of the usual location.  This shifted
 401| significand must be normalized or smaller.  If `zSig' is not normalized,
 402| `zExp' must be 0; in that case, the result returned is a subnormal number,
 403| and it must not require rounding.  In the usual case that `zSig' is
 404| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 405| The handling of underflow and overflow follows the IEC/IEEE Standard for
 406| Binary Floating-Point Arithmetic.
 407*----------------------------------------------------------------------------*/
 408
 409static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 410                                   float_status *status)
 411{
 412    int8_t roundingMode;
 413    flag roundNearestEven;
 414    int8_t roundIncrement, roundBits;
 415    flag isTiny;
 416
 417    roundingMode = status->float_rounding_mode;
 418    roundNearestEven = ( roundingMode == float_round_nearest_even );
 419    switch (roundingMode) {
 420    case float_round_nearest_even:
 421    case float_round_ties_away:
 422        roundIncrement = 0x40;
 423        break;
 424    case float_round_to_zero:
 425        roundIncrement = 0;
 426        break;
 427    case float_round_up:
 428        roundIncrement = zSign ? 0 : 0x7f;
 429        break;
 430    case float_round_down:
 431        roundIncrement = zSign ? 0x7f : 0;
 432        break;
 433    default:
 434        abort();
 435        break;
 436    }
 437    roundBits = zSig & 0x7F;
 438    if ( 0xFD <= (uint16_t) zExp ) {
 439        if (    ( 0xFD < zExp )
 440             || (    ( zExp == 0xFD )
 441                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 442           ) {
 443            float_raise(float_flag_overflow | float_flag_inexact, status);
 444            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 445        }
 446        if ( zExp < 0 ) {
 447            if (status->flush_to_zero) {
 448                float_raise(float_flag_output_denormal, status);
 449                return packFloat32(zSign, 0, 0);
 450            }
 451            isTiny =
 452                (status->float_detect_tininess
 453                 == float_tininess_before_rounding)
 454                || ( zExp < -1 )
 455                || ( zSig + roundIncrement < 0x80000000 );
 456            shift32RightJamming( zSig, - zExp, &zSig );
 457            zExp = 0;
 458            roundBits = zSig & 0x7F;
 459            if (isTiny && roundBits) {
 460                float_raise(float_flag_underflow, status);
 461            }
 462        }
 463    }
 464    if (roundBits) {
 465        status->float_exception_flags |= float_flag_inexact;
 466    }
 467    zSig = ( zSig + roundIncrement )>>7;
 468    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 469    if ( zSig == 0 ) zExp = 0;
 470    return packFloat32( zSign, zExp, zSig );
 471
 472}
 473
 474/*----------------------------------------------------------------------------
 475| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 476| and significand `zSig', and returns the proper single-precision floating-
 477| point value corresponding to the abstract input.  This routine is just like
 478| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 479| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 480| floating-point exponent.
 481*----------------------------------------------------------------------------*/
 482
 483static float32
 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 485                              float_status *status)
 486{
 487    int8_t shiftCount;
 488
 489    shiftCount = countLeadingZeros32( zSig ) - 1;
 490    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 491                               status);
 492
 493}
 494
 495/*----------------------------------------------------------------------------
 496| Returns the fraction bits of the double-precision floating-point value `a'.
 497*----------------------------------------------------------------------------*/
 498
 499static inline uint64_t extractFloat64Frac( float64 a )
 500{
 501
 502    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 503
 504}
 505
 506/*----------------------------------------------------------------------------
 507| Returns the exponent bits of the double-precision floating-point value `a'.
 508*----------------------------------------------------------------------------*/
 509
 510static inline int extractFloat64Exp(float64 a)
 511{
 512
 513    return ( float64_val(a)>>52 ) & 0x7FF;
 514
 515}
 516
 517/*----------------------------------------------------------------------------
 518| Returns the sign bit of the double-precision floating-point value `a'.
 519*----------------------------------------------------------------------------*/
 520
 521static inline flag extractFloat64Sign( float64 a )
 522{
 523
 524    return float64_val(a)>>63;
 525
 526}
 527
 528/*----------------------------------------------------------------------------
 529| If `a' is denormal and we are in flush-to-zero mode then set the
 530| input-denormal exception and return zero. Otherwise just return the value.
 531*----------------------------------------------------------------------------*/
 532float64 float64_squash_input_denormal(float64 a, float_status *status)
 533{
 534    if (status->flush_inputs_to_zero) {
 535        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 536            float_raise(float_flag_input_denormal, status);
 537            return make_float64(float64_val(a) & (1ULL << 63));
 538        }
 539    }
 540    return a;
 541}
 542
 543/*----------------------------------------------------------------------------
 544| Normalizes the subnormal double-precision floating-point value represented
 545| by the denormalized significand `aSig'.  The normalized exponent and
 546| significand are stored at the locations pointed to by `zExpPtr' and
 547| `zSigPtr', respectively.
 548*----------------------------------------------------------------------------*/
 549
 550static void
 551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
 552{
 553    int8_t shiftCount;
 554
 555    shiftCount = countLeadingZeros64( aSig ) - 11;
 556    *zSigPtr = aSig<<shiftCount;
 557    *zExpPtr = 1 - shiftCount;
 558
 559}
 560
 561/*----------------------------------------------------------------------------
 562| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 563| double-precision floating-point value, returning the result.  After being
 564| shifted into the proper positions, the three fields are simply added
 565| together to form the result.  This means that any integer portion of `zSig'
 566| will be added into the exponent.  Since a properly normalized significand
 567| will have an integer portion equal to 1, the `zExp' input should be 1 less
 568| than the desired result exponent whenever `zSig' is a complete, normalized
 569| significand.
 570*----------------------------------------------------------------------------*/
 571
 572static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
 573{
 574
 575    return make_float64(
 576        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 577
 578}
 579
 580/*----------------------------------------------------------------------------
 581| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 582| and significand `zSig', and returns the proper double-precision floating-
 583| point value corresponding to the abstract input.  Ordinarily, the abstract
 584| value is simply rounded and packed into the double-precision format, with
 585| the inexact exception raised if the abstract input cannot be represented
 586| exactly.  However, if the abstract value is too large, the overflow and
 587| inexact exceptions are raised and an infinity or maximal finite value is
 588| returned.  If the abstract value is too small, the input value is rounded to
 589| a subnormal number, and the underflow and inexact exceptions are raised if
 590| the abstract input cannot be represented exactly as a subnormal double-
 591| precision floating-point number.
 592|     The input significand `zSig' has its binary point between bits 62
 593| and 61, which is 10 bits to the left of the usual location.  This shifted
 594| significand must be normalized or smaller.  If `zSig' is not normalized,
 595| `zExp' must be 0; in that case, the result returned is a subnormal number,
 596| and it must not require rounding.  In the usual case that `zSig' is
 597| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 598| The handling of underflow and overflow follows the IEC/IEEE Standard for
 599| Binary Floating-Point Arithmetic.
 600*----------------------------------------------------------------------------*/
 601
 602static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 603                                   float_status *status)
 604{
 605    int8_t roundingMode;
 606    flag roundNearestEven;
 607    int roundIncrement, roundBits;
 608    flag isTiny;
 609
 610    roundingMode = status->float_rounding_mode;
 611    roundNearestEven = ( roundingMode == float_round_nearest_even );
 612    switch (roundingMode) {
 613    case float_round_nearest_even:
 614    case float_round_ties_away:
 615        roundIncrement = 0x200;
 616        break;
 617    case float_round_to_zero:
 618        roundIncrement = 0;
 619        break;
 620    case float_round_up:
 621        roundIncrement = zSign ? 0 : 0x3ff;
 622        break;
 623    case float_round_down:
 624        roundIncrement = zSign ? 0x3ff : 0;
 625        break;
 626    default:
 627        abort();
 628    }
 629    roundBits = zSig & 0x3FF;
 630    if ( 0x7FD <= (uint16_t) zExp ) {
 631        if (    ( 0x7FD < zExp )
 632             || (    ( zExp == 0x7FD )
 633                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 634           ) {
 635            float_raise(float_flag_overflow | float_flag_inexact, status);
 636            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 637        }
 638        if ( zExp < 0 ) {
 639            if (status->flush_to_zero) {
 640                float_raise(float_flag_output_denormal, status);
 641                return packFloat64(zSign, 0, 0);
 642            }
 643            isTiny =
 644                   (status->float_detect_tininess
 645                    == float_tininess_before_rounding)
 646                || ( zExp < -1 )
 647                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 648            shift64RightJamming( zSig, - zExp, &zSig );
 649            zExp = 0;
 650            roundBits = zSig & 0x3FF;
 651            if (isTiny && roundBits) {
 652                float_raise(float_flag_underflow, status);
 653            }
 654        }
 655    }
 656    if (roundBits) {
 657        status->float_exception_flags |= float_flag_inexact;
 658    }
 659    zSig = ( zSig + roundIncrement )>>10;
 660    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 661    if ( zSig == 0 ) zExp = 0;
 662    return packFloat64( zSign, zExp, zSig );
 663
 664}
 665
 666/*----------------------------------------------------------------------------
 667| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 668| and significand `zSig', and returns the proper double-precision floating-
 669| point value corresponding to the abstract input.  This routine is just like
 670| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 671| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 672| floating-point exponent.
 673*----------------------------------------------------------------------------*/
 674
 675static float64
 676 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 677                              float_status *status)
 678{
 679    int8_t shiftCount;
 680
 681    shiftCount = countLeadingZeros64( zSig ) - 1;
 682    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 683                               status);
 684
 685}
 686
 687/*----------------------------------------------------------------------------
 688| Returns the fraction bits of the extended double-precision floating-point
 689| value `a'.
 690*----------------------------------------------------------------------------*/
 691
 692static inline uint64_t extractFloatx80Frac( floatx80 a )
 693{
 694
 695    return a.low;
 696
 697}
 698
 699/*----------------------------------------------------------------------------
 700| Returns the exponent bits of the extended double-precision floating-point
 701| value `a'.
 702*----------------------------------------------------------------------------*/
 703
 704static inline int32_t extractFloatx80Exp( floatx80 a )
 705{
 706
 707    return a.high & 0x7FFF;
 708
 709}
 710
 711/*----------------------------------------------------------------------------
 712| Returns the sign bit of the extended double-precision floating-point value
 713| `a'.
 714*----------------------------------------------------------------------------*/
 715
 716static inline flag extractFloatx80Sign( floatx80 a )
 717{
 718
 719    return a.high>>15;
 720
 721}
 722
 723/*----------------------------------------------------------------------------
 724| Normalizes the subnormal extended double-precision floating-point value
 725| represented by the denormalized significand `aSig'.  The normalized exponent
 726| and significand are stored at the locations pointed to by `zExpPtr' and
 727| `zSigPtr', respectively.
 728*----------------------------------------------------------------------------*/
 729
 730static void
 731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
 732{
 733    int8_t shiftCount;
 734
 735    shiftCount = countLeadingZeros64( aSig );
 736    *zSigPtr = aSig<<shiftCount;
 737    *zExpPtr = 1 - shiftCount;
 738
 739}
 740
 741/*----------------------------------------------------------------------------
 742| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 743| extended double-precision floating-point value, returning the result.
 744*----------------------------------------------------------------------------*/
 745
 746static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
 747{
 748    floatx80 z;
 749
 750    z.low = zSig;
 751    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 752    return z;
 753
 754}
 755
 756/*----------------------------------------------------------------------------
 757| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 758| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 759| and returns the proper extended double-precision floating-point value
 760| corresponding to the abstract input.  Ordinarily, the abstract value is
 761| rounded and packed into the extended double-precision format, with the
 762| inexact exception raised if the abstract input cannot be represented
 763| exactly.  However, if the abstract value is too large, the overflow and
 764| inexact exceptions are raised and an infinity or maximal finite value is
 765| returned.  If the abstract value is too small, the input value is rounded to
 766| a subnormal number, and the underflow and inexact exceptions are raised if
 767| the abstract input cannot be represented exactly as a subnormal extended
 768| double-precision floating-point number.
 769|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 770| number of bits as single or double precision, respectively.  Otherwise, the
 771| result is rounded to the full precision of the extended double-precision
 772| format.
 773|     The input significand must be normalized or smaller.  If the input
 774| significand is not normalized, `zExp' must be 0; in that case, the result
 775| returned is a subnormal number, and it must not require rounding.  The
 776| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 777| Floating-Point Arithmetic.
 778*----------------------------------------------------------------------------*/
 779
 780static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
 781                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
 782                                     float_status *status)
 783{
 784    int8_t roundingMode;
 785    flag roundNearestEven, increment, isTiny;
 786    int64_t roundIncrement, roundMask, roundBits;
 787
 788    roundingMode = status->float_rounding_mode;
 789    roundNearestEven = ( roundingMode == float_round_nearest_even );
 790    if ( roundingPrecision == 80 ) goto precision80;
 791    if ( roundingPrecision == 64 ) {
 792        roundIncrement = LIT64( 0x0000000000000400 );
 793        roundMask = LIT64( 0x00000000000007FF );
 794    }
 795    else if ( roundingPrecision == 32 ) {
 796        roundIncrement = LIT64( 0x0000008000000000 );
 797        roundMask = LIT64( 0x000000FFFFFFFFFF );
 798    }
 799    else {
 800        goto precision80;
 801    }
 802    zSig0 |= ( zSig1 != 0 );
 803    switch (roundingMode) {
 804    case float_round_nearest_even:
 805    case float_round_ties_away:
 806        break;
 807    case float_round_to_zero:
 808        roundIncrement = 0;
 809        break;
 810    case float_round_up:
 811        roundIncrement = zSign ? 0 : roundMask;
 812        break;
 813    case float_round_down:
 814        roundIncrement = zSign ? roundMask : 0;
 815        break;
 816    default:
 817        abort();
 818    }
 819    roundBits = zSig0 & roundMask;
 820    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 821        if (    ( 0x7FFE < zExp )
 822             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 823           ) {
 824            goto overflow;
 825        }
 826        if ( zExp <= 0 ) {
 827            if (status->flush_to_zero) {
 828                float_raise(float_flag_output_denormal, status);
 829                return packFloatx80(zSign, 0, 0);
 830            }
 831            isTiny =
 832                   (status->float_detect_tininess
 833                    == float_tininess_before_rounding)
 834                || ( zExp < 0 )
 835                || ( zSig0 <= zSig0 + roundIncrement );
 836            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 837            zExp = 0;
 838            roundBits = zSig0 & roundMask;
 839            if (isTiny && roundBits) {
 840                float_raise(float_flag_underflow, status);
 841            }
 842            if (roundBits) {
 843                status->float_exception_flags |= float_flag_inexact;
 844            }
 845            zSig0 += roundIncrement;
 846            if ( (int64_t) zSig0 < 0 ) zExp = 1;
 847            roundIncrement = roundMask + 1;
 848            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 849                roundMask |= roundIncrement;
 850            }
 851            zSig0 &= ~ roundMask;
 852            return packFloatx80( zSign, zExp, zSig0 );
 853        }
 854    }
 855    if (roundBits) {
 856        status->float_exception_flags |= float_flag_inexact;
 857    }
 858    zSig0 += roundIncrement;
 859    if ( zSig0 < roundIncrement ) {
 860        ++zExp;
 861        zSig0 = LIT64( 0x8000000000000000 );
 862    }
 863    roundIncrement = roundMask + 1;
 864    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 865        roundMask |= roundIncrement;
 866    }
 867    zSig0 &= ~ roundMask;
 868    if ( zSig0 == 0 ) zExp = 0;
 869    return packFloatx80( zSign, zExp, zSig0 );
 870 precision80:
 871    switch (roundingMode) {
 872    case float_round_nearest_even:
 873    case float_round_ties_away:
 874        increment = ((int64_t)zSig1 < 0);
 875        break;
 876    case float_round_to_zero:
 877        increment = 0;
 878        break;
 879    case float_round_up:
 880        increment = !zSign && zSig1;
 881        break;
 882    case float_round_down:
 883        increment = zSign && zSig1;
 884        break;
 885    default:
 886        abort();
 887    }
 888    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 889        if (    ( 0x7FFE < zExp )
 890             || (    ( zExp == 0x7FFE )
 891                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 892                  && increment
 893                )
 894           ) {
 895            roundMask = 0;
 896 overflow:
 897            float_raise(float_flag_overflow | float_flag_inexact, status);
 898            if (    ( roundingMode == float_round_to_zero )
 899                 || ( zSign && ( roundingMode == float_round_up ) )
 900                 || ( ! zSign && ( roundingMode == float_round_down ) )
 901               ) {
 902                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 903            }
 904            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 905        }
 906        if ( zExp <= 0 ) {
 907            isTiny =
 908                   (status->float_detect_tininess
 909                    == float_tininess_before_rounding)
 910                || ( zExp < 0 )
 911                || ! increment
 912                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 913            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 914            zExp = 0;
 915            if (isTiny && zSig1) {
 916                float_raise(float_flag_underflow, status);
 917            }
 918            if (zSig1) {
 919                status->float_exception_flags |= float_flag_inexact;
 920            }
 921            switch (roundingMode) {
 922            case float_round_nearest_even:
 923            case float_round_ties_away:
 924                increment = ((int64_t)zSig1 < 0);
 925                break;
 926            case float_round_to_zero:
 927                increment = 0;
 928                break;
 929            case float_round_up:
 930                increment = !zSign && zSig1;
 931                break;
 932            case float_round_down:
 933                increment = zSign && zSig1;
 934                break;
 935            default:
 936                abort();
 937            }
 938            if ( increment ) {
 939                ++zSig0;
 940                zSig0 &=
 941                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 942                if ( (int64_t) zSig0 < 0 ) zExp = 1;
 943            }
 944            return packFloatx80( zSign, zExp, zSig0 );
 945        }
 946    }
 947    if (zSig1) {
 948        status->float_exception_flags |= float_flag_inexact;
 949    }
 950    if ( increment ) {
 951        ++zSig0;
 952        if ( zSig0 == 0 ) {
 953            ++zExp;
 954            zSig0 = LIT64( 0x8000000000000000 );
 955        }
 956        else {
 957            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 958        }
 959    }
 960    else {
 961        if ( zSig0 == 0 ) zExp = 0;
 962    }
 963    return packFloatx80( zSign, zExp, zSig0 );
 964
 965}
 966
 967/*----------------------------------------------------------------------------
 968| Takes an abstract floating-point value having sign `zSign', exponent
 969| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 970| and returns the proper extended double-precision floating-point value
 971| corresponding to the abstract input.  This routine is just like
 972| `roundAndPackFloatx80' except that the input significand does not have to be
 973| normalized.
 974*----------------------------------------------------------------------------*/
 975
 976static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
 977                                              flag zSign, int32_t zExp,
 978                                              uint64_t zSig0, uint64_t zSig1,
 979                                              float_status *status)
 980{
 981    int8_t shiftCount;
 982
 983    if ( zSig0 == 0 ) {
 984        zSig0 = zSig1;
 985        zSig1 = 0;
 986        zExp -= 64;
 987    }
 988    shiftCount = countLeadingZeros64( zSig0 );
 989    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 990    zExp -= shiftCount;
 991    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
 992                                zSig0, zSig1, status);
 993
 994}
 995
 996/*----------------------------------------------------------------------------
 997| Returns the least-significant 64 fraction bits of the quadruple-precision
 998| floating-point value `a'.
 999*----------------------------------------------------------------------------*/
1000
1001static inline uint64_t extractFloat128Frac1( float128 a )
1002{
1003
1004    return a.low;
1005
1006}
1007
1008/*----------------------------------------------------------------------------
1009| Returns the most-significant 48 fraction bits of the quadruple-precision
1010| floating-point value `a'.
1011*----------------------------------------------------------------------------*/
1012
1013static inline uint64_t extractFloat128Frac0( float128 a )
1014{
1015
1016    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017
1018}
1019
1020/*----------------------------------------------------------------------------
1021| Returns the exponent bits of the quadruple-precision floating-point value
1022| `a'.
1023*----------------------------------------------------------------------------*/
1024
1025static inline int32_t extractFloat128Exp( float128 a )
1026{
1027
1028    return ( a.high>>48 ) & 0x7FFF;
1029
1030}
1031
1032/*----------------------------------------------------------------------------
1033| Returns the sign bit of the quadruple-precision floating-point value `a'.
1034*----------------------------------------------------------------------------*/
1035
1036static inline flag extractFloat128Sign( float128 a )
1037{
1038
1039    return a.high>>63;
1040
1041}
1042
1043/*----------------------------------------------------------------------------
1044| Normalizes the subnormal quadruple-precision floating-point value
1045| represented by the denormalized significand formed by the concatenation of
1046| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1047| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1048| significand are stored at the location pointed to by `zSig0Ptr', and the
1049| least significant 64 bits of the normalized significand are stored at the
1050| location pointed to by `zSig1Ptr'.
1051*----------------------------------------------------------------------------*/
1052
1053static void
1054 normalizeFloat128Subnormal(
1055     uint64_t aSig0,
1056     uint64_t aSig1,
1057     int32_t *zExpPtr,
1058     uint64_t *zSig0Ptr,
1059     uint64_t *zSig1Ptr
1060 )
1061{
1062    int8_t shiftCount;
1063
1064    if ( aSig0 == 0 ) {
1065        shiftCount = countLeadingZeros64( aSig1 ) - 15;
1066        if ( shiftCount < 0 ) {
1067            *zSig0Ptr = aSig1>>( - shiftCount );
1068            *zSig1Ptr = aSig1<<( shiftCount & 63 );
1069        }
1070        else {
1071            *zSig0Ptr = aSig1<<shiftCount;
1072            *zSig1Ptr = 0;
1073        }
1074        *zExpPtr = - shiftCount - 63;
1075    }
1076    else {
1077        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1078        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1079        *zExpPtr = 1 - shiftCount;
1080    }
1081
1082}
1083
1084/*----------------------------------------------------------------------------
1085| Packs the sign `zSign', the exponent `zExp', and the significand formed
1086| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087| floating-point value, returning the result.  After being shifted into the
1088| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089| added together to form the most significant 32 bits of the result.  This
1090| means that any integer portion of `zSig0' will be added into the exponent.
1091| Since a properly normalized significand will have an integer portion equal
1092| to 1, the `zExp' input should be 1 less than the desired result exponent
1093| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094| significand.
1095*----------------------------------------------------------------------------*/
1096
1097static inline float128
1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099{
1100    float128 z;
1101
1102    z.low = zSig1;
1103    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104    return z;
1105
1106}
1107
1108/*----------------------------------------------------------------------------
1109| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110| and extended significand formed by the concatenation of `zSig0', `zSig1',
1111| and `zSig2', and returns the proper quadruple-precision floating-point value
1112| corresponding to the abstract input.  Ordinarily, the abstract value is
1113| simply rounded and packed into the quadruple-precision format, with the
1114| inexact exception raised if the abstract input cannot be represented
1115| exactly.  However, if the abstract value is too large, the overflow and
1116| inexact exceptions are raised and an infinity or maximal finite value is
1117| returned.  If the abstract value is too small, the input value is rounded to
1118| a subnormal number, and the underflow and inexact exceptions are raised if
1119| the abstract input cannot be represented exactly as a subnormal quadruple-
1120| precision floating-point number.
1121|     The input significand must be normalized or smaller.  If the input
1122| significand is not normalized, `zExp' must be 0; in that case, the result
1123| returned is a subnormal number, and it must not require rounding.  In the
1124| usual case that the input significand is normalized, `zExp' must be 1 less
1125| than the ``true'' floating-point exponent.  The handling of underflow and
1126| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127*----------------------------------------------------------------------------*/
1128
1129static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130                                     uint64_t zSig0, uint64_t zSig1,
1131                                     uint64_t zSig2, float_status *status)
1132{
1133    int8_t roundingMode;
1134    flag roundNearestEven, increment, isTiny;
1135
1136    roundingMode = status->float_rounding_mode;
1137    roundNearestEven = ( roundingMode == float_round_nearest_even );
1138    switch (roundingMode) {
1139    case float_round_nearest_even:
1140    case float_round_ties_away:
1141        increment = ((int64_t)zSig2 < 0);
1142        break;
1143    case float_round_to_zero:
1144        increment = 0;
1145        break;
1146    case float_round_up:
1147        increment = !zSign && zSig2;
1148        break;
1149    case float_round_down:
1150        increment = zSign && zSig2;
1151        break;
1152    default:
1153        abort();
1154    }
1155    if ( 0x7FFD <= (uint32_t) zExp ) {
1156        if (    ( 0x7FFD < zExp )
1157             || (    ( zExp == 0x7FFD )
1158                  && eq128(
1159                         LIT64( 0x0001FFFFFFFFFFFF ),
1160                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1161                         zSig0,
1162                         zSig1
1163                     )
1164                  && increment
1165                )
1166           ) {
1167            float_raise(float_flag_overflow | float_flag_inexact, status);
1168            if (    ( roundingMode == float_round_to_zero )
1169                 || ( zSign && ( roundingMode == float_round_up ) )
1170                 || ( ! zSign && ( roundingMode == float_round_down ) )
1171               ) {
1172                return
1173                    packFloat128(
1174                        zSign,
1175                        0x7FFE,
1176                        LIT64( 0x0000FFFFFFFFFFFF ),
1177                        LIT64( 0xFFFFFFFFFFFFFFFF )
1178                    );
1179            }
1180            return packFloat128( zSign, 0x7FFF, 0, 0 );
1181        }
1182        if ( zExp < 0 ) {
1183            if (status->flush_to_zero) {
1184                float_raise(float_flag_output_denormal, status);
1185                return packFloat128(zSign, 0, 0, 0);
1186            }
1187            isTiny =
1188                   (status->float_detect_tininess
1189                    == float_tininess_before_rounding)
1190                || ( zExp < -1 )
1191                || ! increment
1192                || lt128(
1193                       zSig0,
1194                       zSig1,
1195                       LIT64( 0x0001FFFFFFFFFFFF ),
1196                       LIT64( 0xFFFFFFFFFFFFFFFF )
1197                   );
1198            shift128ExtraRightJamming(
1199                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200            zExp = 0;
1201            if (isTiny && zSig2) {
1202                float_raise(float_flag_underflow, status);
1203            }
1204            switch (roundingMode) {
1205            case float_round_nearest_even:
1206            case float_round_ties_away:
1207                increment = ((int64_t)zSig2 < 0);
1208                break;
1209            case float_round_to_zero:
1210                increment = 0;
1211                break;
1212            case float_round_up:
1213                increment = !zSign && zSig2;
1214                break;
1215            case float_round_down:
1216                increment = zSign && zSig2;
1217                break;
1218            default:
1219                abort();
1220            }
1221        }
1222    }
1223    if (zSig2) {
1224        status->float_exception_flags |= float_flag_inexact;
1225    }
1226    if ( increment ) {
1227        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229    }
1230    else {
1231        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232    }
1233    return packFloat128( zSign, zExp, zSig0, zSig1 );
1234
1235}
1236
1237/*----------------------------------------------------------------------------
1238| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239| and significand formed by the concatenation of `zSig0' and `zSig1', and
1240| returns the proper quadruple-precision floating-point value corresponding
1241| to the abstract input.  This routine is just like `roundAndPackFloat128'
1242| except that the input significand has fewer bits and does not have to be
1243| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1244| point exponent.
1245*----------------------------------------------------------------------------*/
1246
1247static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248                                              uint64_t zSig0, uint64_t zSig1,
1249                                              float_status *status)
1250{
1251    int8_t shiftCount;
1252    uint64_t zSig2;
1253
1254    if ( zSig0 == 0 ) {
1255        zSig0 = zSig1;
1256        zSig1 = 0;
1257        zExp -= 64;
1258    }
1259    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260    if ( 0 <= shiftCount ) {
1261        zSig2 = 0;
1262        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263    }
1264    else {
1265        shift128ExtraRightJamming(
1266            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267    }
1268    zExp -= shiftCount;
1269    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1270
1271}
1272
1273/*----------------------------------------------------------------------------
1274| Returns the result of converting the 32-bit two's complement integer `a'
1275| to the single-precision floating-point format.  The conversion is performed
1276| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277*----------------------------------------------------------------------------*/
1278
1279float32 int32_to_float32(int32_t a, float_status *status)
1280{
1281    flag zSign;
1282
1283    if ( a == 0 ) return float32_zero;
1284    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285    zSign = ( a < 0 );
1286    return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1287}
1288
1289/*----------------------------------------------------------------------------
1290| Returns the result of converting the 32-bit two's complement integer `a'
1291| to the double-precision floating-point format.  The conversion is performed
1292| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293*----------------------------------------------------------------------------*/
1294
1295float64 int32_to_float64(int32_t a, float_status *status)
1296{
1297    flag zSign;
1298    uint32_t absA;
1299    int8_t shiftCount;
1300    uint64_t zSig;
1301
1302    if ( a == 0 ) return float64_zero;
1303    zSign = ( a < 0 );
1304    absA = zSign ? - a : a;
1305    shiftCount = countLeadingZeros32( absA ) + 21;
1306    zSig = absA;
1307    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308
1309}
1310
1311/*----------------------------------------------------------------------------
1312| Returns the result of converting the 32-bit two's complement integer `a'
1313| to the extended double-precision floating-point format.  The conversion
1314| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315| Arithmetic.
1316*----------------------------------------------------------------------------*/
1317
1318floatx80 int32_to_floatx80(int32_t a, float_status *status)
1319{
1320    flag zSign;
1321    uint32_t absA;
1322    int8_t shiftCount;
1323    uint64_t zSig;
1324
1325    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326    zSign = ( a < 0 );
1327    absA = zSign ? - a : a;
1328    shiftCount = countLeadingZeros32( absA ) + 32;
1329    zSig = absA;
1330    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331
1332}
1333
1334/*----------------------------------------------------------------------------
1335| Returns the result of converting the 32-bit two's complement integer `a' to
1336| the quadruple-precision floating-point format.  The conversion is performed
1337| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338*----------------------------------------------------------------------------*/
1339
1340float128 int32_to_float128(int32_t a, float_status *status)
1341{
1342    flag zSign;
1343    uint32_t absA;
1344    int8_t shiftCount;
1345    uint64_t zSig0;
1346
1347    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348    zSign = ( a < 0 );
1349    absA = zSign ? - a : a;
1350    shiftCount = countLeadingZeros32( absA ) + 17;
1351    zSig0 = absA;
1352    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353
1354}
1355
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the 64-bit two's complement integer `a'
1358| to the single-precision floating-point format.  The conversion is performed
1359| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360*----------------------------------------------------------------------------*/
1361
1362float32 int64_to_float32(int64_t a, float_status *status)
1363{
1364    flag zSign;
1365    uint64_t absA;
1366    int8_t shiftCount;
1367
1368    if ( a == 0 ) return float32_zero;
1369    zSign = ( a < 0 );
1370    absA = zSign ? - a : a;
1371    shiftCount = countLeadingZeros64( absA ) - 40;
1372    if ( 0 <= shiftCount ) {
1373        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374    }
1375    else {
1376        shiftCount += 7;
1377        if ( shiftCount < 0 ) {
1378            shift64RightJamming( absA, - shiftCount, &absA );
1379        }
1380        else {
1381            absA <<= shiftCount;
1382        }
1383        return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1384    }
1385
1386}
1387
1388/*----------------------------------------------------------------------------
1389| Returns the result of converting the 64-bit two's complement integer `a'
1390| to the double-precision floating-point format.  The conversion is performed
1391| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392*----------------------------------------------------------------------------*/
1393
1394float64 int64_to_float64(int64_t a, float_status *status)
1395{
1396    flag zSign;
1397
1398    if ( a == 0 ) return float64_zero;
1399    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400        return packFloat64( 1, 0x43E, 0 );
1401    }
1402    zSign = ( a < 0 );
1403    return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1404}
1405
1406/*----------------------------------------------------------------------------
1407| Returns the result of converting the 64-bit two's complement integer `a'
1408| to the extended double-precision floating-point format.  The conversion
1409| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410| Arithmetic.
1411*----------------------------------------------------------------------------*/
1412
1413floatx80 int64_to_floatx80(int64_t a, float_status *status)
1414{
1415    flag zSign;
1416    uint64_t absA;
1417    int8_t shiftCount;
1418
1419    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420    zSign = ( a < 0 );
1421    absA = zSign ? - a : a;
1422    shiftCount = countLeadingZeros64( absA );
1423    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424
1425}
1426
1427/*----------------------------------------------------------------------------
1428| Returns the result of converting the 64-bit two's complement integer `a' to
1429| the quadruple-precision floating-point format.  The conversion is performed
1430| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431*----------------------------------------------------------------------------*/
1432
1433float128 int64_to_float128(int64_t a, float_status *status)
1434{
1435    flag zSign;
1436    uint64_t absA;
1437    int8_t shiftCount;
1438    int32_t zExp;
1439    uint64_t zSig0, zSig1;
1440
1441    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442    zSign = ( a < 0 );
1443    absA = zSign ? - a : a;
1444    shiftCount = countLeadingZeros64( absA ) + 49;
1445    zExp = 0x406E - shiftCount;
1446    if ( 64 <= shiftCount ) {
1447        zSig1 = 0;
1448        zSig0 = absA;
1449        shiftCount -= 64;
1450    }
1451    else {
1452        zSig1 = absA;
1453        zSig0 = 0;
1454    }
1455    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456    return packFloat128( zSign, zExp, zSig0, zSig1 );
1457
1458}
1459
1460/*----------------------------------------------------------------------------
1461| Returns the result of converting the 64-bit unsigned integer `a'
1462| to the single-precision floating-point format.  The conversion is performed
1463| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464*----------------------------------------------------------------------------*/
1465
1466float32 uint64_to_float32(uint64_t a, float_status *status)
1467{
1468    int shiftcount;
1469
1470    if (a == 0) {
1471        return float32_zero;
1472    }
1473
1474    /* Determine (left) shift needed to put first set bit into bit posn 23
1475     * (since packFloat32() expects the binary point between bits 23 and 22);
1476     * this is the fast case for smallish numbers.
1477     */
1478    shiftcount = countLeadingZeros64(a) - 40;
1479    if (shiftcount >= 0) {
1480        return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481    }
1482    /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483     * expects the binary point between bits 30 and 29, hence the + 7.
1484     */
1485    shiftcount += 7;
1486    if (shiftcount < 0) {
1487        shift64RightJamming(a, -shiftcount, &a);
1488    } else {
1489        a <<= shiftcount;
1490    }
1491
1492    return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1493}
1494
1495/*----------------------------------------------------------------------------
1496| Returns the result of converting the 64-bit unsigned integer `a'
1497| to the double-precision floating-point format.  The conversion is performed
1498| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499*----------------------------------------------------------------------------*/
1500
1501float64 uint64_to_float64(uint64_t a, float_status *status)
1502{
1503    int exp = 0x43C;
1504    int shiftcount;
1505
1506    if (a == 0) {
1507        return float64_zero;
1508    }
1509
1510    shiftcount = countLeadingZeros64(a) - 1;
1511    if (shiftcount < 0) {
1512        shift64RightJamming(a, -shiftcount, &a);
1513    } else {
1514        a <<= shiftcount;
1515    }
1516    return roundAndPackFloat64(0, exp - shiftcount, a, status);
1517}
1518
1519/*----------------------------------------------------------------------------
1520| Returns the result of converting the 64-bit unsigned integer `a'
1521| to the quadruple-precision floating-point format.  The conversion is performed
1522| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523*----------------------------------------------------------------------------*/
1524
1525float128 uint64_to_float128(uint64_t a, float_status *status)
1526{
1527    if (a == 0) {
1528        return float128_zero;
1529    }
1530    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1531}
1532
1533/*----------------------------------------------------------------------------
1534| Returns the result of converting the single-precision floating-point value
1535| `a' to the 32-bit two's complement integer format.  The conversion is
1536| performed according to the IEC/IEEE Standard for Binary Floating-Point
1537| Arithmetic---which means in particular that the conversion is rounded
1538| according to the current rounding mode.  If `a' is a NaN, the largest
1539| positive integer is returned.  Otherwise, if the conversion overflows, the
1540| largest integer with the same sign as `a' is returned.
1541*----------------------------------------------------------------------------*/
1542
1543int32_t float32_to_int32(float32 a, float_status *status)
1544{
1545    flag aSign;
1546    int aExp;
1547    int shiftCount;
1548    uint32_t aSig;
1549    uint64_t aSig64;
1550
1551    a = float32_squash_input_denormal(a, status);
1552    aSig = extractFloat32Frac( a );
1553    aExp = extractFloat32Exp( a );
1554    aSign = extractFloat32Sign( a );
1555    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556    if ( aExp ) aSig |= 0x00800000;
1557    shiftCount = 0xAF - aExp;
1558    aSig64 = aSig;
1559    aSig64 <<= 32;
1560    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561    return roundAndPackInt32(aSign, aSig64, status);
1562
1563}
1564
1565/*----------------------------------------------------------------------------
1566| Returns the result of converting the single-precision floating-point value
1567| `a' to the 32-bit two's complement integer format.  The conversion is
1568| performed according to the IEC/IEEE Standard for Binary Floating-Point
1569| Arithmetic, except that the conversion is always rounded toward zero.
1570| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1571| the conversion overflows, the largest integer with the same sign as `a' is
1572| returned.
1573*----------------------------------------------------------------------------*/
1574
1575int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576{
1577    flag aSign;
1578    int aExp;
1579    int shiftCount;
1580    uint32_t aSig;
1581    int32_t z;
1582    a = float32_squash_input_denormal(a, status);
1583
1584    aSig = extractFloat32Frac( a );
1585    aExp = extractFloat32Exp( a );
1586    aSign = extractFloat32Sign( a );
1587    shiftCount = aExp - 0x9E;
1588    if ( 0 <= shiftCount ) {
1589        if ( float32_val(a) != 0xCF000000 ) {
1590            float_raise(float_flag_invalid, status);
1591            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592        }
1593        return (int32_t) 0x80000000;
1594    }
1595    else if ( aExp <= 0x7E ) {
1596        if (aExp | aSig) {
1597            status->float_exception_flags |= float_flag_inexact;
1598        }
1599        return 0;
1600    }
1601    aSig = ( aSig | 0x00800000 )<<8;
1602    z = aSig>>( - shiftCount );
1603    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1604        status->float_exception_flags |= float_flag_inexact;
1605    }
1606    if ( aSign ) z = - z;
1607    return z;
1608
1609}
1610
1611/*----------------------------------------------------------------------------
1612| Returns the result of converting the single-precision floating-point value
1613| `a' to the 16-bit two's complement integer format.  The conversion is
1614| performed according to the IEC/IEEE Standard for Binary Floating-Point
1615| Arithmetic, except that the conversion is always rounded toward zero.
1616| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1617| the conversion overflows, the largest integer with the same sign as `a' is
1618| returned.
1619*----------------------------------------------------------------------------*/
1620
1621int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1622{
1623    flag aSign;
1624    int aExp;
1625    int shiftCount;
1626    uint32_t aSig;
1627    int32_t z;
1628
1629    aSig = extractFloat32Frac( a );
1630    aExp = extractFloat32Exp( a );
1631    aSign = extractFloat32Sign( a );
1632    shiftCount = aExp - 0x8E;
1633    if ( 0 <= shiftCount ) {
1634        if ( float32_val(a) != 0xC7000000 ) {
1635            float_raise(float_flag_invalid, status);
1636            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1637                return 0x7FFF;
1638            }
1639        }
1640        return (int32_t) 0xffff8000;
1641    }
1642    else if ( aExp <= 0x7E ) {
1643        if ( aExp | aSig ) {
1644            status->float_exception_flags |= float_flag_inexact;
1645        }
1646        return 0;
1647    }
1648    shiftCount -= 0x10;
1649    aSig = ( aSig | 0x00800000 )<<8;
1650    z = aSig>>( - shiftCount );
1651    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1652        status->float_exception_flags |= float_flag_inexact;
1653    }
1654    if ( aSign ) {
1655        z = - z;
1656    }
1657    return z;
1658
1659}
1660
1661/*----------------------------------------------------------------------------
1662| Returns the result of converting the single-precision floating-point value
1663| `a' to the 64-bit two's complement integer format.  The conversion is
1664| performed according to the IEC/IEEE Standard for Binary Floating-Point
1665| Arithmetic---which means in particular that the conversion is rounded
1666| according to the current rounding mode.  If `a' is a NaN, the largest
1667| positive integer is returned.  Otherwise, if the conversion overflows, the
1668| largest integer with the same sign as `a' is returned.
1669*----------------------------------------------------------------------------*/
1670
1671int64_t float32_to_int64(float32 a, float_status *status)
1672{
1673    flag aSign;
1674    int aExp;
1675    int shiftCount;
1676    uint32_t aSig;
1677    uint64_t aSig64, aSigExtra;
1678    a = float32_squash_input_denormal(a, status);
1679
1680    aSig = extractFloat32Frac( a );
1681    aExp = extractFloat32Exp( a );
1682    aSign = extractFloat32Sign( a );
1683    shiftCount = 0xBE - aExp;
1684    if ( shiftCount < 0 ) {
1685        float_raise(float_flag_invalid, status);
1686        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1687            return LIT64( 0x7FFFFFFFFFFFFFFF );
1688        }
1689        return (int64_t) LIT64( 0x8000000000000000 );
1690    }
1691    if ( aExp ) aSig |= 0x00800000;
1692    aSig64 = aSig;
1693    aSig64 <<= 40;
1694    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1695    return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696
1697}
1698
1699/*----------------------------------------------------------------------------
1700| Returns the result of converting the single-precision floating-point value
1701| `a' to the 64-bit unsigned integer format.  The conversion is
1702| performed according to the IEC/IEEE Standard for Binary Floating-Point
1703| Arithmetic---which means in particular that the conversion is rounded
1704| according to the current rounding mode.  If `a' is a NaN, the largest
1705| unsigned integer is returned.  Otherwise, if the conversion overflows, the
1706| largest unsigned integer is returned.  If the 'a' is negative, the result
1707| is rounded and zero is returned; values that do not round to zero will
1708| raise the inexact exception flag.
1709*----------------------------------------------------------------------------*/
1710
1711uint64_t float32_to_uint64(float32 a, float_status *status)
1712{
1713    flag aSign;
1714    int aExp;
1715    int shiftCount;
1716    uint32_t aSig;
1717    uint64_t aSig64, aSigExtra;
1718    a = float32_squash_input_denormal(a, status);
1719
1720    aSig = extractFloat32Frac(a);
1721    aExp = extractFloat32Exp(a);
1722    aSign = extractFloat32Sign(a);
1723    if ((aSign) && (aExp > 126)) {
1724        float_raise(float_flag_invalid, status);
1725        if (float32_is_any_nan(a)) {
1726            return LIT64(0xFFFFFFFFFFFFFFFF);
1727        } else {
1728            return 0;
1729        }
1730    }
1731    shiftCount = 0xBE - aExp;
1732    if (aExp) {
1733        aSig |= 0x00800000;
1734    }
1735    if (shiftCount < 0) {
1736        float_raise(float_flag_invalid, status);
1737        return LIT64(0xFFFFFFFFFFFFFFFF);
1738    }
1739
1740    aSig64 = aSig;
1741    aSig64 <<= 40;
1742    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1743    return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1744}
1745
1746/*----------------------------------------------------------------------------
1747| Returns the result of converting the single-precision floating-point value
1748| `a' to the 64-bit unsigned integer format.  The conversion is
1749| performed according to the IEC/IEEE Standard for Binary Floating-Point
1750| Arithmetic, except that the conversion is always rounded toward zero.  If
1751| `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1752| conversion overflows, the largest unsigned integer is returned.  If the
1753| 'a' is negative, the result is rounded and zero is returned; values that do
1754| not round to zero will raise the inexact flag.
1755*----------------------------------------------------------------------------*/
1756
1757uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1758{
1759    signed char current_rounding_mode = status->float_rounding_mode;
1760    set_float_rounding_mode(float_round_to_zero, status);
1761    int64_t v = float32_to_uint64(a, status);
1762    set_float_rounding_mode(current_rounding_mode, status);
1763    return v;
1764}
1765
1766/*----------------------------------------------------------------------------
1767| Returns the result of converting the single-precision floating-point value
1768| `a' to the 64-bit two's complement integer format.  The conversion is
1769| performed according to the IEC/IEEE Standard for Binary Floating-Point
1770| Arithmetic, except that the conversion is always rounded toward zero.  If
1771| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1772| conversion overflows, the largest integer with the same sign as `a' is
1773| returned.
1774*----------------------------------------------------------------------------*/
1775
1776int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1777{
1778    flag aSign;
1779    int aExp;
1780    int shiftCount;
1781    uint32_t aSig;
1782    uint64_t aSig64;
1783    int64_t z;
1784    a = float32_squash_input_denormal(a, status);
1785
1786    aSig = extractFloat32Frac( a );
1787    aExp = extractFloat32Exp( a );
1788    aSign = extractFloat32Sign( a );
1789    shiftCount = aExp - 0xBE;
1790    if ( 0 <= shiftCount ) {
1791        if ( float32_val(a) != 0xDF000000 ) {
1792            float_raise(float_flag_invalid, status);
1793            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1794                return LIT64( 0x7FFFFFFFFFFFFFFF );
1795            }
1796        }
1797        return (int64_t) LIT64( 0x8000000000000000 );
1798    }
1799    else if ( aExp <= 0x7E ) {
1800        if (aExp | aSig) {
1801            status->float_exception_flags |= float_flag_inexact;
1802        }
1803        return 0;
1804    }
1805    aSig64 = aSig | 0x00800000;
1806    aSig64 <<= 40;
1807    z = aSig64>>( - shiftCount );
1808    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1809        status->float_exception_flags |= float_flag_inexact;
1810    }
1811    if ( aSign ) z = - z;
1812    return z;
1813
1814}
1815
1816/*----------------------------------------------------------------------------
1817| Returns the result of converting the single-precision floating-point value
1818| `a' to the double-precision floating-point format.  The conversion is
1819| performed according to the IEC/IEEE Standard for Binary Floating-Point
1820| Arithmetic.
1821*----------------------------------------------------------------------------*/
1822
1823float64 float32_to_float64(float32 a, float_status *status)
1824{
1825    flag aSign;
1826    int aExp;
1827    uint32_t aSig;
1828    a = float32_squash_input_denormal(a, status);
1829
1830    aSig = extractFloat32Frac( a );
1831    aExp = extractFloat32Exp( a );
1832    aSign = extractFloat32Sign( a );
1833    if ( aExp == 0xFF ) {
1834        if (aSig) {
1835            return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1836        }
1837        return packFloat64( aSign, 0x7FF, 0 );
1838    }
1839    if ( aExp == 0 ) {
1840        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1841        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1842        --aExp;
1843    }
1844    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1845
1846}
1847
1848/*----------------------------------------------------------------------------
1849| Returns the result of converting the single-precision floating-point value
1850| `a' to the extended double-precision floating-point format.  The conversion
1851| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1852| Arithmetic.
1853*----------------------------------------------------------------------------*/
1854
1855floatx80 float32_to_floatx80(float32 a, float_status *status)
1856{
1857    flag aSign;
1858    int aExp;
1859    uint32_t aSig;
1860
1861    a = float32_squash_input_denormal(a, status);
1862    aSig = extractFloat32Frac( a );
1863    aExp = extractFloat32Exp( a );
1864    aSign = extractFloat32Sign( a );
1865    if ( aExp == 0xFF ) {
1866        if (aSig) {
1867            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1868        }
1869        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1870    }
1871    if ( aExp == 0 ) {
1872        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1873        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874    }
1875    aSig |= 0x00800000;
1876    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1877
1878}
1879
1880/*----------------------------------------------------------------------------
1881| Returns the result of converting the single-precision floating-point value
1882| `a' to the double-precision floating-point format.  The conversion is
1883| performed according to the IEC/IEEE Standard for Binary Floating-Point
1884| Arithmetic.
1885*----------------------------------------------------------------------------*/
1886
1887float128 float32_to_float128(float32 a, float_status *status)
1888{
1889    flag aSign;
1890    int aExp;
1891    uint32_t aSig;
1892
1893    a = float32_squash_input_denormal(a, status);
1894    aSig = extractFloat32Frac( a );
1895    aExp = extractFloat32Exp( a );
1896    aSign = extractFloat32Sign( a );
1897    if ( aExp == 0xFF ) {
1898        if (aSig) {
1899            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1900        }
1901        return packFloat128( aSign, 0x7FFF, 0, 0 );
1902    }
1903    if ( aExp == 0 ) {
1904        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1905        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1906        --aExp;
1907    }
1908    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1909
1910}
1911
1912/*----------------------------------------------------------------------------
1913| Rounds the single-precision floating-point value `a' to an integer, and
1914| returns the result as a single-precision floating-point value.  The
1915| operation is performed according to the IEC/IEEE Standard for Binary
1916| Floating-Point Arithmetic.
1917*----------------------------------------------------------------------------*/
1918
1919float32 float32_round_to_int(float32 a, float_status *status)
1920{
1921    flag aSign;
1922    int aExp;
1923    uint32_t lastBitMask, roundBitsMask;
1924    uint32_t z;
1925    a = float32_squash_input_denormal(a, status);
1926
1927    aExp = extractFloat32Exp( a );
1928    if ( 0x96 <= aExp ) {
1929        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1930            return propagateFloat32NaN(a, a, status);
1931        }
1932        return a;
1933    }
1934    if ( aExp <= 0x7E ) {
1935        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1936        status->float_exception_flags |= float_flag_inexact;
1937        aSign = extractFloat32Sign( a );
1938        switch (status->float_rounding_mode) {
1939         case float_round_nearest_even:
1940            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1941                return packFloat32( aSign, 0x7F, 0 );
1942            }
1943            break;
1944        case float_round_ties_away:
1945            if (aExp == 0x7E) {
1946                return packFloat32(aSign, 0x7F, 0);
1947            }
1948            break;
1949         case float_round_down:
1950            return make_float32(aSign ? 0xBF800000 : 0);
1951         case float_round_up:
1952            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1953        }
1954        return packFloat32( aSign, 0, 0 );
1955    }
1956    lastBitMask = 1;
1957    lastBitMask <<= 0x96 - aExp;
1958    roundBitsMask = lastBitMask - 1;
1959    z = float32_val(a);
1960    switch (status->float_rounding_mode) {
1961    case float_round_nearest_even:
1962        z += lastBitMask>>1;
1963        if ((z & roundBitsMask) == 0) {
1964            z &= ~lastBitMask;
1965        }
1966        break;
1967    case float_round_ties_away:
1968        z += lastBitMask >> 1;
1969        break;
1970    case float_round_to_zero:
1971        break;
1972    case float_round_up:
1973        if (!extractFloat32Sign(make_float32(z))) {
1974            z += roundBitsMask;
1975        }
1976        break;
1977    case float_round_down:
1978        if (extractFloat32Sign(make_float32(z))) {
1979            z += roundBitsMask;
1980        }
1981        break;
1982    default:
1983        abort();
1984    }
1985    z &= ~ roundBitsMask;
1986    if (z != float32_val(a)) {
1987        status->float_exception_flags |= float_flag_inexact;
1988    }
1989    return make_float32(z);
1990
1991}
1992
1993/*----------------------------------------------------------------------------
1994| Returns the result of adding the absolute values of the single-precision
1995| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1996| before being returned.  `zSign' is ignored if the result is a NaN.
1997| The addition is performed according to the IEC/IEEE Standard for Binary
1998| Floating-Point Arithmetic.
1999*----------------------------------------------------------------------------*/
2000
2001static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2002                              float_status *status)
2003{
2004    int aExp, bExp, zExp;
2005    uint32_t aSig, bSig, zSig;
2006    int expDiff;
2007
2008    aSig = extractFloat32Frac( a );
2009    aExp = extractFloat32Exp( a );
2010    bSig = extractFloat32Frac( b );
2011    bExp = extractFloat32Exp( b );
2012    expDiff = aExp - bExp;
2013    aSig <<= 6;
2014    bSig <<= 6;
2015    if ( 0 < expDiff ) {
2016        if ( aExp == 0xFF ) {
2017            if (aSig) {
2018                return propagateFloat32NaN(a, b, status);
2019            }
2020            return a;
2021        }
2022        if ( bExp == 0 ) {
2023            --expDiff;
2024        }
2025        else {
2026            bSig |= 0x20000000;
2027        }
2028        shift32RightJamming( bSig, expDiff, &bSig );
2029        zExp = aExp;
2030    }
2031    else if ( expDiff < 0 ) {
2032        if ( bExp == 0xFF ) {
2033            if (bSig) {
2034                return propagateFloat32NaN(a, b, status);
2035            }
2036            return packFloat32( zSign, 0xFF, 0 );
2037        }
2038        if ( aExp == 0 ) {
2039            ++expDiff;
2040        }
2041        else {
2042            aSig |= 0x20000000;
2043        }
2044        shift32RightJamming( aSig, - expDiff, &aSig );
2045        zExp = bExp;
2046    }
2047    else {
2048        if ( aExp == 0xFF ) {
2049            if (aSig | bSig) {
2050                return propagateFloat32NaN(a, b, status);
2051            }
2052            return a;
2053        }
2054        if ( aExp == 0 ) {
2055            if (status->flush_to_zero) {
2056                if (aSig | bSig) {
2057                    float_raise(float_flag_output_denormal, status);
2058                }
2059                return packFloat32(zSign, 0, 0);
2060            }
2061            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2062        }
2063        zSig = 0x40000000 + aSig + bSig;
2064        zExp = aExp;
2065        goto roundAndPack;
2066    }
2067    aSig |= 0x20000000;
2068    zSig = ( aSig + bSig )<<1;
2069    --zExp;
2070    if ( (int32_t) zSig < 0 ) {
2071        zSig = aSig + bSig;
2072        ++zExp;
2073    }
2074 roundAndPack:
2075    return roundAndPackFloat32(zSign, zExp, zSig, status);
2076
2077}
2078
2079/*----------------------------------------------------------------------------
2080| Returns the result of subtracting the absolute values of the single-
2081| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2082| difference is negated before being returned.  `zSign' is ignored if the
2083| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2084| Standard for Binary Floating-Point Arithmetic.
2085*----------------------------------------------------------------------------*/
2086
2087static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2088                              float_status *status)
2089{
2090    int aExp, bExp, zExp;
2091    uint32_t aSig, bSig, zSig;
2092    int expDiff;
2093
2094    aSig = extractFloat32Frac( a );
2095    aExp = extractFloat32Exp( a );
2096    bSig = extractFloat32Frac( b );
2097    bExp = extractFloat32Exp( b );
2098    expDiff = aExp - bExp;
2099    aSig <<= 7;
2100    bSig <<= 7;
2101    if ( 0 < expDiff ) goto aExpBigger;
2102    if ( expDiff < 0 ) goto bExpBigger;
2103    if ( aExp == 0xFF ) {
2104        if (aSig | bSig) {
2105            return propagateFloat32NaN(a, b, status);
2106        }
2107        float_raise(float_flag_invalid, status);
2108        return float32_default_nan(status);
2109    }
2110    if ( aExp == 0 ) {
2111        aExp = 1;
2112        bExp = 1;
2113    }
2114    if ( bSig < aSig ) goto aBigger;
2115    if ( aSig < bSig ) goto bBigger;
2116    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2117 bExpBigger:
2118    if ( bExp == 0xFF ) {
2119        if (bSig) {
2120            return propagateFloat32NaN(a, b, status);
2121        }
2122        return packFloat32( zSign ^ 1, 0xFF, 0 );
2123    }
2124    if ( aExp == 0 ) {
2125        ++expDiff;
2126    }
2127    else {
2128        aSig |= 0x40000000;
2129    }
2130    shift32RightJamming( aSig, - expDiff, &aSig );
2131    bSig |= 0x40000000;
2132 bBigger:
2133    zSig = bSig - aSig;
2134    zExp = bExp;
2135    zSign ^= 1;
2136    goto normalizeRoundAndPack;
2137 aExpBigger:
2138    if ( aExp == 0xFF ) {
2139        if (aSig) {
2140            return propagateFloat32NaN(a, b, status);
2141        }
2142        return a;
2143    }
2144    if ( bExp == 0 ) {
2145        --expDiff;
2146    }
2147    else {
2148        bSig |= 0x40000000;
2149    }
2150    shift32RightJamming( bSig, expDiff, &bSig );
2151    aSig |= 0x40000000;
2152 aBigger:
2153    zSig = aSig - bSig;
2154    zExp = aExp;
2155 normalizeRoundAndPack:
2156    --zExp;
2157    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2158
2159}
2160
2161/*----------------------------------------------------------------------------
2162| Returns the result of adding the single-precision floating-point values `a'
2163| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2164| Binary Floating-Point Arithmetic.
2165*----------------------------------------------------------------------------*/
2166
2167float32 float32_add(float32 a, float32 b, float_status *status)
2168{
2169    flag aSign, bSign;
2170    a = float32_squash_input_denormal(a, status);
2171    b = float32_squash_input_denormal(b, status);
2172
2173    aSign = extractFloat32Sign( a );
2174    bSign = extractFloat32Sign( b );
2175    if ( aSign == bSign ) {
2176        return addFloat32Sigs(a, b, aSign, status);
2177    }
2178    else {
2179        return subFloat32Sigs(a, b, aSign, status);
2180    }
2181
2182}
2183
2184/*----------------------------------------------------------------------------
2185| Returns the result of subtracting the single-precision floating-point values
2186| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2187| for Binary Floating-Point Arithmetic.
2188*----------------------------------------------------------------------------*/
2189
2190float32 float32_sub(float32 a, float32 b, float_status *status)
2191{
2192    flag aSign, bSign;
2193    a = float32_squash_input_denormal(a, status);
2194    b = float32_squash_input_denormal(b, status);
2195
2196    aSign = extractFloat32Sign( a );
2197    bSign = extractFloat32Sign( b );
2198    if ( aSign == bSign ) {
2199        return subFloat32Sigs(a, b, aSign, status);
2200    }
2201    else {
2202        return addFloat32Sigs(a, b, aSign, status);
2203    }
2204
2205}
2206
2207/*----------------------------------------------------------------------------
2208| Returns the result of multiplying the single-precision floating-point values
2209| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2210| for Binary Floating-Point Arithmetic.
2211*----------------------------------------------------------------------------*/
2212
2213float32 float32_mul(float32 a, float32 b, float_status *status)
2214{
2215    flag aSign, bSign, zSign;
2216    int aExp, bExp, zExp;
2217    uint32_t aSig, bSig;
2218    uint64_t zSig64;
2219    uint32_t zSig;
2220
2221    a = float32_squash_input_denormal(a, status);
2222    b = float32_squash_input_denormal(b, status);
2223
2224    aSig = extractFloat32Frac( a );
2225    aExp = extractFloat32Exp( a );
2226    aSign = extractFloat32Sign( a );
2227    bSig = extractFloat32Frac( b );
2228    bExp = extractFloat32Exp( b );
2229    bSign = extractFloat32Sign( b );
2230    zSign = aSign ^ bSign;
2231    if ( aExp == 0xFF ) {
2232        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2233            return propagateFloat32NaN(a, b, status);
2234        }
2235        if ( ( bExp | bSig ) == 0 ) {
2236            float_raise(float_flag_invalid, status);
2237            return float32_default_nan(status);
2238        }
2239        return packFloat32( zSign, 0xFF, 0 );
2240    }
2241    if ( bExp == 0xFF ) {
2242        if (bSig) {
2243            return propagateFloat32NaN(a, b, status);
2244        }
2245        if ( ( aExp | aSig ) == 0 ) {
2246            float_raise(float_flag_invalid, status);
2247            return float32_default_nan(status);
2248        }
2249        return packFloat32( zSign, 0xFF, 0 );
2250    }
2251    if ( aExp == 0 ) {
2252        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2253        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254    }
2255    if ( bExp == 0 ) {
2256        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2257        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2258    }
2259    zExp = aExp + bExp - 0x7F;
2260    aSig = ( aSig | 0x00800000 )<<7;
2261    bSig = ( bSig | 0x00800000 )<<8;
2262    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2263    zSig = zSig64;
2264    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2265        zSig <<= 1;
2266        --zExp;
2267    }
2268    return roundAndPackFloat32(zSign, zExp, zSig, status);
2269
2270}
2271
2272/*----------------------------------------------------------------------------
2273| Returns the result of dividing the single-precision floating-point value `a'
2274| by the corresponding value `b'.  The operation is performed according to the
2275| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276*----------------------------------------------------------------------------*/
2277
2278float32 float32_div(float32 a, float32 b, float_status *status)
2279{
2280    flag aSign, bSign, zSign;
2281    int aExp, bExp, zExp;
2282    uint32_t aSig, bSig, zSig;
2283    a = float32_squash_input_denormal(a, status);
2284    b = float32_squash_input_denormal(b, status);
2285
2286    aSig = extractFloat32Frac( a );
2287    aExp = extractFloat32Exp( a );
2288    aSign = extractFloat32Sign( a );
2289    bSig = extractFloat32Frac( b );
2290    bExp = extractFloat32Exp( b );
2291    bSign = extractFloat32Sign( b );
2292    zSign = aSign ^ bSign;
2293    if ( aExp == 0xFF ) {
2294        if (aSig) {
2295            return propagateFloat32NaN(a, b, status);
2296        }
2297        if ( bExp == 0xFF ) {
2298            if (bSig) {
2299                return propagateFloat32NaN(a, b, status);
2300            }
2301            float_raise(float_flag_invalid, status);
2302            return float32_default_nan(status);
2303        }
2304        return packFloat32( zSign, 0xFF, 0 );
2305    }
2306    if ( bExp == 0xFF ) {
2307        if (bSig) {
2308            return propagateFloat32NaN(a, b, status);
2309        }
2310        return packFloat32( zSign, 0, 0 );
2311    }
2312    if ( bExp == 0 ) {
2313        if ( bSig == 0 ) {
2314            if ( ( aExp | aSig ) == 0 ) {
2315                float_raise(float_flag_invalid, status);
2316                return float32_default_nan(status);
2317            }
2318            float_raise(float_flag_divbyzero, status);
2319            return packFloat32( zSign, 0xFF, 0 );
2320        }
2321        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2322    }
2323    if ( aExp == 0 ) {
2324        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2325        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2326    }
2327    zExp = aExp - bExp + 0x7D;
2328    aSig = ( aSig | 0x00800000 )<<7;
2329    bSig = ( bSig | 0x00800000 )<<8;
2330    if ( bSig <= ( aSig + aSig ) ) {
2331        aSig >>= 1;
2332        ++zExp;
2333    }
2334    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2335    if ( ( zSig & 0x3F ) == 0 ) {
2336        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2337    }
2338    return roundAndPackFloat32(zSign, zExp, zSig, status);
2339
2340}
2341
2342/*----------------------------------------------------------------------------
2343| Returns the remainder of the single-precision floating-point value `a'
2344| with respect to the corresponding value `b'.  The operation is performed
2345| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346*----------------------------------------------------------------------------*/
2347
2348float32 float32_rem(float32 a, float32 b, float_status *status)
2349{
2350    flag aSign, zSign;
2351    int aExp, bExp, expDiff;
2352    uint32_t aSig, bSig;
2353    uint32_t q;
2354    uint64_t aSig64, bSig64, q64;
2355    uint32_t alternateASig;
2356    int32_t sigMean;
2357    a = float32_squash_input_denormal(a, status);
2358    b = float32_squash_input_denormal(b, status);
2359
2360    aSig = extractFloat32Frac( a );
2361    aExp = extractFloat32Exp( a );
2362    aSign = extractFloat32Sign( a );
2363    bSig = extractFloat32Frac( b );
2364    bExp = extractFloat32Exp( b );
2365    if ( aExp == 0xFF ) {
2366        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2367            return propagateFloat32NaN(a, b, status);
2368        }
2369        float_raise(float_flag_invalid, status);
2370        return float32_default_nan(status);
2371    }
2372    if ( bExp == 0xFF ) {
2373        if (bSig) {
2374            return propagateFloat32NaN(a, b, status);
2375        }
2376        return a;
2377    }
2378    if ( bExp == 0 ) {
2379        if ( bSig == 0 ) {
2380            float_raise(float_flag_invalid, status);
2381            return float32_default_nan(status);
2382        }
2383        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2384    }
2385    if ( aExp == 0 ) {
2386        if ( aSig == 0 ) return a;
2387        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2388    }
2389    expDiff = aExp - bExp;
2390    aSig |= 0x00800000;
2391    bSig |= 0x00800000;
2392    if ( expDiff < 32 ) {
2393        aSig <<= 8;
2394        bSig <<= 8;
2395        if ( expDiff < 0 ) {
2396            if ( expDiff < -1 ) return a;
2397            aSig >>= 1;
2398        }
2399        q = ( bSig <= aSig );
2400        if ( q ) aSig -= bSig;
2401        if ( 0 < expDiff ) {
2402            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2403            q >>= 32 - expDiff;
2404            bSig >>= 2;
2405            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2406        }
2407        else {
2408            aSig >>= 2;
2409            bSig >>= 2;
2410        }
2411    }
2412    else {
2413        if ( bSig <= aSig ) aSig -= bSig;
2414        aSig64 = ( (uint64_t) aSig )<<40;
2415        bSig64 = ( (uint64_t) bSig )<<40;
2416        expDiff -= 64;
2417        while ( 0 < expDiff ) {
2418            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420            aSig64 = - ( ( bSig * q64 )<<38 );
2421            expDiff -= 62;
2422        }
2423        expDiff += 64;
2424        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2425        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2426        q = q64>>( 64 - expDiff );
2427        bSig <<= 6;
2428        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2429    }
2430    do {
2431        alternateASig = aSig;
2432        ++q;
2433        aSig -= bSig;
2434    } while ( 0 <= (int32_t) aSig );
2435    sigMean = aSig + alternateASig;
2436    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2437        aSig = alternateASig;
2438    }
2439    zSign = ( (int32_t) aSig < 0 );
2440    if ( zSign ) aSig = - aSig;
2441    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2442}
2443
2444/*----------------------------------------------------------------------------
2445| Returns the result of multiplying the single-precision floating-point values
2446| `a' and `b' then adding 'c', with no intermediate rounding step after the
2447| multiplication.  The operation is performed according to the IEC/IEEE
2448| Standard for Binary Floating-Point Arithmetic 754-2008.
2449| The flags argument allows the caller to select negation of the
2450| addend, the intermediate product, or the final result. (The difference
2451| between this and having the caller do a separate negation is that negating
2452| externally will flip the sign bit on NaNs.)
2453*----------------------------------------------------------------------------*/
2454
2455float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2456                       float_status *status)
2457{
2458    flag aSign, bSign, cSign, zSign;
2459    int aExp, bExp, cExp, pExp, zExp, expDiff;
2460    uint32_t aSig, bSig, cSig;
2461    flag pInf, pZero, pSign;
2462    uint64_t pSig64, cSig64, zSig64;
2463    uint32_t pSig;
2464    int shiftcount;
2465    flag signflip, infzero;
2466
2467    a = float32_squash_input_denormal(a, status);
2468    b = float32_squash_input_denormal(b, status);
2469    c = float32_squash_input_denormal(c, status);
2470    aSig = extractFloat32Frac(a);
2471    aExp = extractFloat32Exp(a);
2472    aSign = extractFloat32Sign(a);
2473    bSig = extractFloat32Frac(b);
2474    bExp = extractFloat32Exp(b);
2475    bSign = extractFloat32Sign(b);
2476    cSig = extractFloat32Frac(c);
2477    cExp = extractFloat32Exp(c);
2478    cSign = extractFloat32Sign(c);
2479
2480    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2481               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2482
2483    /* It is implementation-defined whether the cases of (0,inf,qnan)
2484     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485     * they return if they do), so we have to hand this information
2486     * off to the target-specific pick-a-NaN routine.
2487     */
2488    if (((aExp == 0xff) && aSig) ||
2489        ((bExp == 0xff) && bSig) ||
2490        ((cExp == 0xff) && cSig)) {
2491        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2492    }
2493
2494    if (infzero) {
2495        float_raise(float_flag_invalid, status);
2496        return float32_default_nan(status);
2497    }
2498
2499    if (flags & float_muladd_negate_c) {
2500        cSign ^= 1;
2501    }
2502
2503    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2504
2505    /* Work out the sign and type of the product */
2506    pSign = aSign ^ bSign;
2507    if (flags & float_muladd_negate_product) {
2508        pSign ^= 1;
2509    }
2510    pInf = (aExp == 0xff) || (bExp == 0xff);
2511    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2512
2513    if (cExp == 0xff) {
2514        if (pInf && (pSign ^ cSign)) {
2515            /* addition of opposite-signed infinities => InvalidOperation */
2516            float_raise(float_flag_invalid, status);
2517            return float32_default_nan(status);
2518        }
2519        /* Otherwise generate an infinity of the same sign */
2520        return packFloat32(cSign ^ signflip, 0xff, 0);
2521    }
2522
2523    if (pInf) {
2524        return packFloat32(pSign ^ signflip, 0xff, 0);
2525    }
2526
2527    if (pZero) {
2528        if (cExp == 0) {
2529            if (cSig == 0) {
2530                /* Adding two exact zeroes */
2531                if (pSign == cSign) {
2532                    zSign = pSign;
2533                } else if (status->float_rounding_mode == float_round_down) {
2534                    zSign = 1;
2535                } else {
2536                    zSign = 0;
2537                }
2538                return packFloat32(zSign ^ signflip, 0, 0);
2539            }
2540            /* Exact zero plus a denorm */
2541            if (status->flush_to_zero) {
2542                float_raise(float_flag_output_denormal, status);
2543                return packFloat32(cSign ^ signflip, 0, 0);
2544            }
2545        }
2546        /* Zero plus something non-zero : just return the something */
2547        if (flags & float_muladd_halve_result) {
2548            if (cExp == 0) {
2549                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2550            }
2551            /* Subtract one to halve, and one again because roundAndPackFloat32
2552             * wants one less than the true exponent.
2553             */
2554            cExp -= 2;
2555            cSig = (cSig | 0x00800000) << 7;
2556            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2557        }
2558        return packFloat32(cSign ^ signflip, cExp, cSig);
2559    }
2560
2561    if (aExp == 0) {
2562        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2563    }
2564    if (bExp == 0) {
2565        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2566    }
2567
2568    /* Calculate the actual result a * b + c */
2569
2570    /* Multiply first; this is easy. */
2571    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572     * because we want the true exponent, not the "one-less-than"
2573     * flavour that roundAndPackFloat32() takes.
2574     */
2575    pExp = aExp + bExp - 0x7e;
2576    aSig = (aSig | 0x00800000) << 7;
2577    bSig = (bSig | 0x00800000) << 8;
2578    pSig64 = (uint64_t)aSig * bSig;
2579    if ((int64_t)(pSig64 << 1) >= 0) {
2580        pSig64 <<= 1;
2581        pExp--;
2582    }
2583
2584    zSign = pSign ^ signflip;
2585
2586    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2587     * position 62.
2588     */
2589    if (cExp == 0) {
2590        if (!cSig) {
2591            /* Throw out the special case of c being an exact zero now */
2592            shift64RightJamming(pSig64, 32, &pSig64);
2593            pSig = pSig64;
2594            if (flags & float_muladd_halve_result) {
2595                pExp--;
2596            }
2597            return roundAndPackFloat32(zSign, pExp - 1,
2598                                       pSig, status);
2599        }
2600        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2601    }
2602
2603    cSig64 = (uint64_t)cSig << (62 - 23);
2604    cSig64 |= LIT64(0x4000000000000000);
2605    expDiff = pExp - cExp;
2606
2607    if (pSign == cSign) {
2608        /* Addition */
2609        if (expDiff > 0) {
2610            /* scale c to match p */
2611            shift64RightJamming(cSig64, expDiff, &cSig64);
2612            zExp = pExp;
2613        } else if (expDiff < 0) {
2614            /* scale p to match c */
2615            shift64RightJamming(pSig64, -expDiff, &pSig64);
2616            zExp = cExp;
2617        } else {
2618            /* no scaling needed */
2619            zExp = cExp;
2620        }
2621        /* Add significands and make sure explicit bit ends up in posn 62 */
2622        zSig64 = pSig64 + cSig64;
2623        if ((int64_t)zSig64 < 0) {
2624            shift64RightJamming(zSig64, 1, &zSig64);
2625        } else {
2626            zExp--;
2627        }
2628    } else {
2629        /* Subtraction */
2630        if (expDiff > 0) {
2631            shift64RightJamming(cSig64, expDiff, &cSig64);
2632            zSig64 = pSig64 - cSig64;
2633            zExp = pExp;
2634        } else if (expDiff < 0) {
2635            shift64RightJamming(pSig64, -expDiff, &pSig64);
2636            zSig64 = cSig64 - pSig64;
2637            zExp = cExp;
2638            zSign ^= 1;
2639        } else {
2640            zExp = pExp;
2641            if (cSig64 < pSig64) {
2642                zSig64 = pSig64 - cSig64;
2643            } else if (pSig64 < cSig64) {
2644                zSig64 = cSig64 - pSig64;
2645                zSign ^= 1;
2646            } else {
2647                /* Exact zero */
2648                zSign = signflip;
2649                if (status->float_rounding_mode == float_round_down) {
2650                    zSign ^= 1;
2651                }
2652                return packFloat32(zSign, 0, 0);
2653            }
2654        }
2655        --zExp;
2656        /* Normalize to put the explicit bit back into bit 62. */
2657        shiftcount = countLeadingZeros64(zSig64) - 1;
2658        zSig64 <<= shiftcount;
2659        zExp -= shiftcount;
2660    }
2661    if (flags & float_muladd_halve_result) {
2662        zExp--;
2663    }
2664
2665    shift64RightJamming(zSig64, 32, &zSig64);
2666    return roundAndPackFloat32(zSign, zExp, zSig64, status);
2667}
2668
2669
2670/*----------------------------------------------------------------------------
2671| Returns the square root of the single-precision floating-point value `a'.
2672| The operation is performed according to the IEC/IEEE Standard for Binary
2673| Floating-Point Arithmetic.
2674*----------------------------------------------------------------------------*/
2675
2676float32 float32_sqrt(float32 a, float_status *status)
2677{
2678    flag aSign;
2679    int aExp, zExp;
2680    uint32_t aSig, zSig;
2681    uint64_t rem, term;
2682    a = float32_squash_input_denormal(a, status);
2683
2684    aSig = extractFloat32Frac( a );
2685    aExp = extractFloat32Exp( a );
2686    aSign = extractFloat32Sign( a );
2687    if ( aExp == 0xFF ) {
2688        if (aSig) {
2689            return propagateFloat32NaN(a, float32_zero, status);
2690        }
2691        if ( ! aSign ) return a;
2692        float_raise(float_flag_invalid, status);
2693        return float32_default_nan(status);
2694    }
2695    if ( aSign ) {
2696        if ( ( aExp | aSig ) == 0 ) return a;
2697        float_raise(float_flag_invalid, status);
2698        return float32_default_nan(status);
2699    }
2700    if ( aExp == 0 ) {
2701        if ( aSig == 0 ) return float32_zero;
2702        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2703    }
2704    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2705    aSig = ( aSig | 0x00800000 )<<8;
2706    zSig = estimateSqrt32( aExp, aSig ) + 2;
2707    if ( ( zSig & 0x7F ) <= 5 ) {
2708        if ( zSig < 2 ) {
2709            zSig = 0x7FFFFFFF;
2710            goto roundAndPack;
2711        }
2712        aSig >>= aExp & 1;
2713        term = ( (uint64_t) zSig ) * zSig;
2714        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2715        while ( (int64_t) rem < 0 ) {
2716            --zSig;
2717            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2718        }
2719        zSig |= ( rem != 0 );
2720    }
2721    shift32RightJamming( zSig, 1, &zSig );
2722 roundAndPack:
2723    return roundAndPackFloat32(0, zExp, zSig, status);
2724
2725}
2726
2727/*----------------------------------------------------------------------------
2728| Returns the binary exponential of the single-precision floating-point value
2729| `a'. The operation is performed according to the IEC/IEEE Standard for
2730| Binary Floating-Point Arithmetic.
2731|
2732| Uses the following identities:
2733|
2734| 1. -------------------------------------------------------------------------
2735|      x    x*ln(2)
2736|     2  = e
2737|
2738| 2. -------------------------------------------------------------------------
2739|                      2     3     4     5           n
2740|      x        x     x     x     x     x           x
2741|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2742|               1!    2!    3!    4!    5!          n!
2743*----------------------------------------------------------------------------*/
2744
2745static const float64 float32_exp2_coefficients[15] =
2746{
2747    const_float64( 0x3ff0000000000000ll ), /*  1 */
2748    const_float64( 0x3fe0000000000000ll ), /*  2 */
2749    const_float64( 0x3fc5555555555555ll ), /*  3 */
2750    const_float64( 0x3fa5555555555555ll ), /*  4 */
2751    const_float64( 0x3f81111111111111ll ), /*  5 */
2752    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2753    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2754    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2755    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2756    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2757    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2758    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2759    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2760    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2761    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2762};
2763
2764float32 float32_exp2(float32 a, float_status *status)
2765{
2766    flag aSign;
2767    int aExp;
2768    uint32_t aSig;
2769    float64 r, x, xn;
2770    int i;
2771    a = float32_squash_input_denormal(a, status);
2772
2773    aSig = extractFloat32Frac( a );
2774    aExp = extractFloat32Exp( a );
2775    aSign = extractFloat32Sign( a );
2776
2777    if ( aExp == 0xFF) {
2778        if (aSig) {
2779            return propagateFloat32NaN(a, float32_zero, status);
2780        }
2781        return (aSign) ? float32_zero : a;
2782    }
2783    if (aExp == 0) {
2784        if (aSig == 0) return float32_one;
2785    }
2786
2787    float_raise(float_flag_inexact, status);
2788
2789    /* ******************************* */
2790    /* using float64 for approximation */
2791    /* ******************************* */
2792    x = float32_to_float64(a, status);
2793    x = float64_mul(x, float64_ln2, status);
2794
2795    xn = x;
2796    r = float64_one;
2797    for (i = 0 ; i < 15 ; i++) {
2798        float64 f;
2799
2800        f = float64_mul(xn, float32_exp2_coefficients[i], status);
2801        r = float64_add(r, f, status);
2802
2803        xn = float64_mul(xn, x, status);
2804    }
2805
2806    return float64_to_float32(r, status);
2807}
2808
2809/*----------------------------------------------------------------------------
2810| Returns the binary log of the single-precision floating-point value `a'.
2811| The operation is performed according to the IEC/IEEE Standard for Binary
2812| Floating-Point Arithmetic.
2813*----------------------------------------------------------------------------*/
2814float32 float32_log2(float32 a, float_status *status)
2815{
2816    flag aSign, zSign;
2817    int aExp;
2818    uint32_t aSig, zSig, i;
2819
2820    a = float32_squash_input_denormal(a, status);
2821    aSig = extractFloat32Frac( a );
2822    aExp = extractFloat32Exp( a );
2823    aSign = extractFloat32Sign( a );
2824
2825    if ( aExp == 0 ) {
2826        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2827        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2828    }
2829    if ( aSign ) {
2830        float_raise(float_flag_invalid, status);
2831        return float32_default_nan(status);
2832    }
2833    if ( aExp == 0xFF ) {
2834        if (aSig) {
2835            return propagateFloat32NaN(a, float32_zero, status);
2836        }
2837        return a;
2838    }
2839
2840    aExp -= 0x7F;
2841    aSig |= 0x00800000;
2842    zSign = aExp < 0;
2843    zSig = aExp << 23;
2844
2845    for (i = 1 << 22; i > 0; i >>= 1) {
2846        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2847        if ( aSig & 0x01000000 ) {
2848            aSig >>= 1;
2849            zSig |= i;
2850        }
2851    }
2852
2853    if ( zSign )
2854        zSig = -zSig;
2855
2856    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2857}
2858
2859/*----------------------------------------------------------------------------
2860| Returns 1 if the single-precision floating-point value `a' is equal to
2861| the corresponding value `b', and 0 otherwise.  The invalid exception is
2862| raised if either operand is a NaN.  Otherwise, the comparison is performed
2863| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864*----------------------------------------------------------------------------*/
2865
2866int float32_eq(float32 a, float32 b, float_status *status)
2867{
2868    uint32_t av, bv;
2869    a = float32_squash_input_denormal(a, status);
2870    b = float32_squash_input_denormal(b, status);
2871
2872    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2873         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2874       ) {
2875        float_raise(float_flag_invalid, status);
2876        return 0;
2877    }
2878    av = float32_val(a);
2879    bv = float32_val(b);
2880    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2881}
2882
2883/*----------------------------------------------------------------------------
2884| Returns 1 if the single-precision floating-point value `a' is less than
2885| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2886| exception is raised if either operand is a NaN.  The comparison is performed
2887| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888*----------------------------------------------------------------------------*/
2889
2890int float32_le(float32 a, float32 b, float_status *status)
2891{
2892    flag aSign, bSign;
2893    uint32_t av, bv;
2894    a = float32_squash_input_denormal(a, status);
2895    b = float32_squash_input_denormal(b, status);
2896
2897    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2899       ) {
2900        float_raise(float_flag_invalid, status);
2901        return 0;
2902    }
2903    aSign = extractFloat32Sign( a );
2904    bSign = extractFloat32Sign( b );
2905    av = float32_val(a);
2906    bv = float32_val(b);
2907    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2908    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2909
2910}
2911
2912/*----------------------------------------------------------------------------
2913| Returns 1 if the single-precision floating-point value `a' is less than
2914| the corresponding value `b', and 0 otherwise.  The invalid exception is
2915| raised if either operand is a NaN.  The comparison is performed according
2916| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917*----------------------------------------------------------------------------*/
2918
2919int float32_lt(float32 a, float32 b, float_status *status)
2920{
2921    flag aSign, bSign;
2922    uint32_t av, bv;
2923    a = float32_squash_input_denormal(a, status);
2924    b = float32_squash_input_denormal(b, status);
2925
2926    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2927         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2928       ) {
2929        float_raise(float_flag_invalid, status);
2930        return 0;
2931    }
2932    aSign = extractFloat32Sign( a );
2933    bSign = extractFloat32Sign( b );
2934    av = float32_val(a);
2935    bv = float32_val(b);
2936    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2937    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2938
2939}
2940
2941/*----------------------------------------------------------------------------
2942| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943| be compared, and 0 otherwise.  The invalid exception is raised if either
2944| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2945| Standard for Binary Floating-Point Arithmetic.
2946*----------------------------------------------------------------------------*/
2947
2948int float32_unordered(float32 a, float32 b, float_status *status)
2949{
2950    a = float32_squash_input_denormal(a, status);
2951    b = float32_squash_input_denormal(b, status);
2952
2953    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2954         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2955       ) {
2956        float_raise(float_flag_invalid, status);
2957        return 1;
2958    }
2959    return 0;
2960}
2961
2962/*----------------------------------------------------------------------------
2963| Returns 1 if the single-precision floating-point value `a' is equal to
2964| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2965| exception.  The comparison is performed according to the IEC/IEEE Standard
2966| for Binary Floating-Point Arithmetic.
2967*----------------------------------------------------------------------------*/
2968
2969int float32_eq_quiet(float32 a, float32 b, float_status *status)
2970{
2971    a = float32_squash_input_denormal(a, status);
2972    b = float32_squash_input_denormal(b, status);
2973
2974    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2975         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2976       ) {
2977        if (float32_is_signaling_nan(a, status)
2978         || float32_is_signaling_nan(b, status)) {
2979            float_raise(float_flag_invalid, status);
2980        }
2981        return 0;
2982    }
2983    return ( float32_val(a) == float32_val(b) ) ||
2984            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2985}
2986
2987/*----------------------------------------------------------------------------
2988| Returns 1 if the single-precision floating-point value `a' is less than or
2989| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2990| cause an exception.  Otherwise, the comparison is performed according to the
2991| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2992*----------------------------------------------------------------------------*/
2993
2994int float32_le_quiet(float32 a, float32 b, float_status *status)
2995{
2996    flag aSign, bSign;
2997    uint32_t av, bv;
2998    a = float32_squash_input_denormal(a, status);
2999    b = float32_squash_input_denormal(b, status);
3000
3001    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3002         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3003       ) {
3004        if (float32_is_signaling_nan(a, status)
3005         || float32_is_signaling_nan(b, status)) {
3006            float_raise(float_flag_invalid, status);
3007        }
3008        return 0;
3009    }
3010    aSign = extractFloat32Sign( a );
3011    bSign = extractFloat32Sign( b );
3012    av = float32_val(a);
3013    bv = float32_val(b);
3014    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3015    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3016
3017}
3018
3019/*----------------------------------------------------------------------------
3020| Returns 1 if the single-precision floating-point value `a' is less than
3021| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3022| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3023| Standard for Binary Floating-Point Arithmetic.
3024*----------------------------------------------------------------------------*/
3025
3026int float32_lt_quiet(float32 a, float32 b, float_status *status)
3027{
3028    flag aSign, bSign;
3029    uint32_t av, bv;
3030    a = float32_squash_input_denormal(a, status);
3031    b = float32_squash_input_denormal(b, status);
3032
3033    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3034         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3035       ) {
3036        if (float32_is_signaling_nan(a, status)
3037         || float32_is_signaling_nan(b, status)) {
3038            float_raise(float_flag_invalid, status);
3039        }
3040        return 0;
3041    }
3042    aSign = extractFloat32Sign( a );
3043    bSign = extractFloat32Sign( b );
3044    av = float32_val(a);
3045    bv = float32_val(b);
3046    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3047    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3048
3049}
3050
3051/*----------------------------------------------------------------------------
3052| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3053| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3054| comparison is performed according to the IEC/IEEE Standard for Binary
3055| Floating-Point Arithmetic.
3056*----------------------------------------------------------------------------*/
3057
3058int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3059{
3060    a = float32_squash_input_denormal(a, status);
3061    b = float32_squash_input_denormal(b, status);
3062
3063    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3064         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3065       ) {
3066        if (float32_is_signaling_nan(a, status)
3067         || float32_is_signaling_nan(b, status)) {
3068            float_raise(float_flag_invalid, status);
3069        }
3070        return 1;
3071    }
3072    return 0;
3073}
3074
3075/*----------------------------------------------------------------------------
3076| Returns the result of converting the double-precision floating-point value
3077| `a' to the 32-bit two's complement integer format.  The conversion is
3078| performed according to the IEC/IEEE Standard for Binary Floating-Point
3079| Arithmetic---which means in particular that the conversion is rounded
3080| according to the current rounding mode.  If `a' is a NaN, the largest
3081| positive integer is returned.  Otherwise, if the conversion overflows, the
3082| largest integer with the same sign as `a' is returned.
3083*----------------------------------------------------------------------------*/
3084
3085int32_t float64_to_int32(float64 a, float_status *status)
3086{
3087    flag aSign;
3088    int aExp;
3089    int shiftCount;
3090    uint64_t aSig;
3091    a = float64_squash_input_denormal(a, status);
3092
3093    aSig = extractFloat64Frac( a );
3094    aExp = extractFloat64Exp( a );
3095    aSign = extractFloat64Sign( a );
3096    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3097    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3098    shiftCount = 0x42C - aExp;
3099    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3100    return roundAndPackInt32(aSign, aSig, status);
3101
3102}
3103
3104/*----------------------------------------------------------------------------
3105| Returns the result of converting the double-precision floating-point value
3106| `a' to the 32-bit two's complement integer format.  The conversion is
3107| performed according to the IEC/IEEE Standard for Binary Floating-Point
3108| Arithmetic, except that the conversion is always rounded toward zero.
3109| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3110| the conversion overflows, the largest integer with the same sign as `a' is
3111| returned.
3112*----------------------------------------------------------------------------*/
3113
3114int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3115{
3116    flag aSign;
3117    int aExp;
3118    int shiftCount;
3119    uint64_t aSig, savedASig;
3120    int32_t z;
3121    a = float64_squash_input_denormal(a, status);
3122
3123    aSig = extractFloat64Frac( a );
3124    aExp = extractFloat64Exp( a );
3125    aSign = extractFloat64Sign( a );
3126    if ( 0x41E < aExp ) {
3127        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3128        goto invalid;
3129    }
3130    else if ( aExp < 0x3FF ) {
3131        if (aExp || aSig) {
3132            status->float_exception_flags |= float_flag_inexact;
3133        }
3134        return 0;
3135    }
3136    aSig |= LIT64( 0x0010000000000000 );
3137    shiftCount = 0x433 - aExp;
3138    savedASig = aSig;
3139    aSig >>= shiftCount;
3140    z = aSig;
3141    if ( aSign ) z = - z;
3142    if ( ( z < 0 ) ^ aSign ) {
3143 invalid:
3144        float_raise(float_flag_invalid, status);
3145        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3146    }
3147    if ( ( aSig<<shiftCount ) != savedASig ) {
3148        status->float_exception_flags |= float_flag_inexact;
3149    }
3150    return z;
3151
3152}
3153
3154/*----------------------------------------------------------------------------
3155| Returns the result of converting the double-precision floating-point value
3156| `a' to the 16-bit two's complement integer format.  The conversion is
3157| performed according to the IEC/IEEE Standard for Binary Floating-Point
3158| Arithmetic, except that the conversion is always rounded toward zero.
3159| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3160| the conversion overflows, the largest integer with the same sign as `a' is
3161| returned.
3162*----------------------------------------------------------------------------*/
3163
3164int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3165{
3166    flag aSign;
3167    int aExp;
3168    int shiftCount;
3169    uint64_t aSig, savedASig;
3170    int32_t z;
3171
3172    aSig = extractFloat64Frac( a );
3173    aExp = extractFloat64Exp( a );
3174    aSign = extractFloat64Sign( a );
3175    if ( 0x40E < aExp ) {
3176        if ( ( aExp == 0x7FF ) && aSig ) {
3177            aSign = 0;
3178        }
3179        goto invalid;
3180    }
3181    else if ( aExp < 0x3FF ) {
3182        if ( aExp || aSig ) {
3183            status->float_exception_flags |= float_flag_inexact;
3184        }
3185        return 0;
3186    }
3187    aSig |= LIT64( 0x0010000000000000 );
3188    shiftCount = 0x433 - aExp;
3189    savedASig = aSig;
3190    aSig >>= shiftCount;
3191    z = aSig;
3192    if ( aSign ) {
3193        z = - z;
3194    }
3195    if ( ( (int16_t)z < 0 ) ^ aSign ) {
3196 invalid:
3197        float_raise(float_flag_invalid, status);
3198        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3199    }
3200    if ( ( aSig<<shiftCount ) != savedASig ) {
3201        status->float_exception_flags |= float_flag_inexact;
3202    }
3203    return z;
3204}
3205
3206/*----------------------------------------------------------------------------
3207| Returns the result of converting the double-precision floating-point value
3208| `a' to the 64-bit two's complement integer format.  The conversion is
3209| performed according to the IEC/IEEE Standard for Binary Floating-Point
3210| Arithmetic---which means in particular that the conversion is rounded
3211| according to the current rounding mode.  If `a' is a NaN, the largest
3212| positive integer is returned.  Otherwise, if the conversion overflows, the
3213| largest integer with the same sign as `a' is returned.
3214*----------------------------------------------------------------------------*/
3215
3216int64_t float64_to_int64(float64 a, float_status *status)
3217{
3218    flag aSign;
3219    int aExp;
3220    int shiftCount;
3221    uint64_t aSig, aSigExtra;
3222    a = float64_squash_input_denormal(a, status);
3223
3224    aSig = extractFloat64Frac( a );
3225    aExp = extractFloat64Exp( a );
3226    aSign = extractFloat64Sign( a );
3227    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3228    shiftCount = 0x433 - aExp;
3229    if ( shiftCount <= 0 ) {
3230        if ( 0x43E < aExp ) {
3231            float_raise(float_flag_invalid, status);
3232            if (    ! aSign
3233                 || (    ( aExp == 0x7FF )
3234                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3235               ) {
3236                return LIT64( 0x7FFFFFFFFFFFFFFF );
3237            }
3238            return (int64_t) LIT64( 0x8000000000000000 );
3239        }
3240        aSigExtra = 0;
3241        aSig <<= - shiftCount;
3242    }
3243    else {
3244        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3245    }
3246    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3247
3248}
3249
3250/*----------------------------------------------------------------------------
3251| Returns the result of converting the double-precision floating-point value
3252| `a' to the 64-bit two's complement integer format.  The conversion is
3253| performed according to the IEC/IEEE Standard for Binary Floating-Point
3254| Arithmetic, except that the conversion is always rounded toward zero.
3255| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3256| the conversion overflows, the largest integer with the same sign as `a' is
3257| returned.
3258*----------------------------------------------------------------------------*/
3259
3260int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3261{
3262    flag aSign;
3263    int aExp;
3264    int shiftCount;
3265    uint64_t aSig;
3266    int64_t z;
3267    a = float64_squash_input_denormal(a, status);
3268
3269    aSig = extractFloat64Frac( a );
3270    aExp = extractFloat64Exp( a );
3271    aSign = extractFloat64Sign( a );
3272    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3273    shiftCount = aExp - 0x433;
3274    if ( 0 <= shiftCount ) {
3275        if ( 0x43E <= aExp ) {
3276            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3277                float_raise(float_flag_invalid, status);
3278                if (    ! aSign
3279                     || (    ( aExp == 0x7FF )
3280                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3281                   ) {
3282                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3283                }
3284            }
3285            return (int64_t) LIT64( 0x8000000000000000 );
3286        }
3287        z = aSig<<shiftCount;
3288    }
3289    else {
3290        if ( aExp < 0x3FE ) {
3291            if (aExp | aSig) {
3292                status->float_exception_flags |= float_flag_inexact;
3293            }
3294            return 0;
3295        }
3296        z = aSig>>( - shiftCount );
3297        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3298            status->float_exception_flags |= float_flag_inexact;
3299        }
3300    }
3301    if ( aSign ) z = - z;
3302    return z;
3303
3304}
3305
3306/*----------------------------------------------------------------------------
3307| Returns the result of converting the double-precision floating-point value
3308| `a' to the single-precision floating-point format.  The conversion is
3309| performed according to the IEC/IEEE Standard for Binary Floating-Point
3310| Arithmetic.
3311*----------------------------------------------------------------------------*/
3312
3313float32 float64_to_float32(float64 a, float_status *status)
3314{
3315    flag aSign;
3316    int aExp;
3317    uint64_t aSig;
3318    uint32_t zSig;
3319    a = float64_squash_input_denormal(a, status);
3320
3321    aSig = extractFloat64Frac( a );
3322    aExp = extractFloat64Exp( a );
3323    aSign = extractFloat64Sign( a );
3324    if ( aExp == 0x7FF ) {
3325        if (aSig) {
3326            return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3327        }
3328        return packFloat32( aSign, 0xFF, 0 );
3329    }
3330    shift64RightJamming( aSig, 22, &aSig );
3331    zSig = aSig;
3332    if ( aExp || zSig ) {
3333        zSig |= 0x40000000;
3334        aExp -= 0x381;
3335    }
3336    return roundAndPackFloat32(aSign, aExp, zSig, status);
3337
3338}
3339
3340
3341/*----------------------------------------------------------------------------
3342| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3343| half-precision floating-point value, returning the result.  After being
3344| shifted into the proper positions, the three fields are simply added
3345| together to form the result.  This means that any integer portion of `zSig'
3346| will be added into the exponent.  Since a properly normalized significand
3347| will have an integer portion equal to 1, the `zExp' input should be 1 less
3348| than the desired result exponent whenever `zSig' is a complete, normalized
3349| significand.
3350*----------------------------------------------------------------------------*/
3351static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3352{
3353    return make_float16(
3354        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3355}
3356
3357/*----------------------------------------------------------------------------
3358| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3359| and significand `zSig', and returns the proper half-precision floating-
3360| point value corresponding to the abstract input.  Ordinarily, the abstract
3361| value is simply rounded and packed into the half-precision format, with
3362| the inexact exception raised if the abstract input cannot be represented
3363| exactly.  However, if the abstract value is too large, the overflow and
3364| inexact exceptions are raised and an infinity or maximal finite value is
3365| returned.  If the abstract value is too small, the input value is rounded to
3366| a subnormal number, and the underflow and inexact exceptions are raised if
3367| the abstract input cannot be represented exactly as a subnormal half-
3368| precision floating-point number.
3369| The `ieee' flag indicates whether to use IEEE standard half precision, or
3370| ARM-style "alternative representation", which omits the NaN and Inf
3371| encodings in order to raise the maximum representable exponent by one.
3372|     The input significand `zSig' has its binary point between bits 22
3373| and 23, which is 13 bits to the left of the usual location.  This shifted
3374| significand must be normalized or smaller.  If `zSig' is not normalized,
3375| `zExp' must be 0; in that case, the result returned is a subnormal number,
3376| and it must not require rounding.  In the usual case that `zSig' is
3377| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3378| Note the slightly odd position of the binary point in zSig compared with the
3379| other roundAndPackFloat functions. This should probably be fixed if we
3380| need to implement more float16 routines than just conversion.
3381| The handling of underflow and overflow follows the IEC/IEEE Standard for
3382| Binary Floating-Point Arithmetic.
3383*----------------------------------------------------------------------------*/
3384
3385static float16 roundAndPackFloat16(flag zSign, int zExp,
3386                                   uint32_t zSig, flag ieee,
3387                                   float_status *status)
3388{
3389    int maxexp = ieee ? 29 : 30;
3390    uint32_t mask;
3391    uint32_t increment;
3392    bool rounding_bumps_exp;
3393    bool is_tiny = false;
3394
3395    /* Calculate the mask of bits of the mantissa which are not
3396     * representable in half-precision and will be lost.
3397     */
3398    if (zExp < 1) {
3399        /* Will be denormal in halfprec */
3400        mask = 0x00ffffff;
3401        if (zExp >= -11) {
3402            mask >>= 11 + zExp;
3403        }
3404    } else {
3405        /* Normal number in halfprec */
3406        mask = 0x00001fff;
3407    }
3408
3409    switch (status->float_rounding_mode) {
3410    case float_round_nearest_even:
3411        increment = (mask + 1) >> 1;
3412        if ((zSig & mask) == increment) {
3413            increment = zSig & (increment << 1);
3414        }
3415        break;
3416    case float_round_ties_away:
3417        increment = (mask + 1) >> 1;
3418        break;
3419    case float_round_up:
3420        increment = zSign ? 0 : mask;
3421        break;
3422    case float_round_down:
3423        increment = zSign ? mask : 0;
3424        break;
3425    default: /* round_to_zero */
3426        increment = 0;
3427        break;
3428    }
3429
3430    rounding_bumps_exp = (zSig + increment >= 0x01000000);
3431
3432    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3433        if (ieee) {
3434            float_raise(float_flag_overflow | float_flag_inexact, status);
3435            return packFloat16(zSign, 0x1f, 0);
3436        } else {
3437            float_raise(float_flag_invalid, status);
3438            return packFloat16(zSign, 0x1f, 0x3ff);
3439        }
3440    }
3441
3442    if (zExp < 0) {
3443        /* Note that flush-to-zero does not affect half-precision results */
3444        is_tiny =
3445            (status->float_detect_tininess == float_tininess_before_rounding)
3446            || (zExp < -1)
3447            || (!rounding_bumps_exp);
3448    }
3449    if (zSig & mask) {
3450        float_raise(float_flag_inexact, status);
3451        if (is_tiny) {
3452            float_raise(float_flag_underflow, status);
3453        }
3454    }
3455
3456    zSig += increment;
3457    if (rounding_bumps_exp) {
3458        zSig >>= 1;
3459        zExp++;
3460    }
3461
3462    if (zExp < -10) {
3463        return packFloat16(zSign, 0, 0);
3464    }
3465    if (zExp < 0) {
3466        zSig >>= -zExp;
3467        zExp = 0;
3468    }
3469    return packFloat16(zSign, zExp, zSig >> 13);
3470}
3471
3472static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3473                                      uint32_t *zSigPtr)
3474{
3475    int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3476    *zSigPtr = aSig << shiftCount;
3477    *zExpPtr = 1 - shiftCount;
3478}
3479
3480/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3481   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3482
3483float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3484{
3485    flag aSign;
3486    int aExp;
3487    uint32_t aSig;
3488
3489    aSign = extractFloat16Sign(a);
3490    aExp = extractFloat16Exp(a);
3491    aSig = extractFloat16Frac(a);
3492
3493    if (aExp == 0x1f && ieee) {
3494        if (aSig) {
3495            return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3496        }
3497        return packFloat32(aSign, 0xff, 0);
3498    }
3499    if (aExp == 0) {
3500        if (aSig == 0) {
3501            return packFloat32(aSign, 0, 0);
3502        }
3503
3504        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3505        aExp--;
3506    }
3507    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3508}
3509
3510float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3511{
3512    flag aSign;
3513    int aExp;
3514    uint32_t aSig;
3515
3516    a = float32_squash_input_denormal(a, status);
3517
3518    aSig = extractFloat32Frac( a );
3519    aExp = extractFloat32Exp( a );
3520    aSign = extractFloat32Sign( a );
3521    if ( aExp == 0xFF ) {
3522        if (aSig) {
3523            /* Input is a NaN */
3524            if (!ieee) {
3525                float_raise(float_flag_invalid, status);
3526                return packFloat16(aSign, 0, 0);
3527            }
3528            return commonNaNToFloat16(
3529                float32ToCommonNaN(a, status), status);
3530        }
3531        /* Infinity */
3532        if (!ieee) {
3533            float_raise(float_flag_invalid, status);
3534            return packFloat16(aSign, 0x1f, 0x3ff);
3535        }
3536        return packFloat16(aSign, 0x1f, 0);
3537    }
3538    if (aExp == 0 && aSig == 0) {
3539        return packFloat16(aSign, 0, 0);
3540    }
3541    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3542     * even if the input is denormal; however this is harmless because
3543     * the largest possible single-precision denormal is still smaller
3544     * than the smallest representable half-precision denormal, and so we
3545     * will end up ignoring aSig and returning via the "always return zero"
3546     * codepath.
3547     */
3548    aSig |= 0x00800000;
3549    aExp -= 0x71;
3550
3551    return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3552}
3553
3554float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3555{
3556    flag aSign;
3557    int aExp;
3558    uint32_t aSig;
3559
3560    aSign = extractFloat16Sign(a);
3561    aExp = extractFloat16Exp(a);
3562    aSig = extractFloat16Frac(a);
3563
3564    if (aExp == 0x1f && ieee) {
3565        if (aSig) {
3566            return commonNaNToFloat64(
3567                float16ToCommonNaN(a, status), status);
3568        }
3569        return packFloat64(aSign, 0x7ff, 0);
3570    }
3571    if (aExp == 0) {
3572        if (aSig == 0) {
3573            return packFloat64(aSign, 0, 0);
3574        }
3575
3576        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3577        aExp--;
3578    }
3579    return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3580}
3581
3582float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3583{
3584    flag aSign;
3585    int aExp;
3586    uint64_t aSig;
3587    uint32_t zSig;
3588
3589    a = float64_squash_input_denormal(a, status);
3590
3591    aSig = extractFloat64Frac(a);
3592    aExp = extractFloat64Exp(a);
3593    aSign = extractFloat64Sign(a);
3594    if (aExp == 0x7FF) {
3595        if (aSig) {
3596            /* Input is a NaN */
3597            if (!ieee) {
3598                float_raise(float_flag_invalid, status);
3599                return packFloat16(aSign, 0, 0);
3600            }
3601            return commonNaNToFloat16(
3602                float64ToCommonNaN(a, status), status);
3603        }
3604        /* Infinity */
3605        if (!ieee) {
3606            float_raise(float_flag_invalid, status);
3607            return packFloat16(aSign, 0x1f, 0x3ff);
3608        }
3609        return packFloat16(aSign, 0x1f, 0);
3610    }
3611    shift64RightJamming(aSig, 29, &aSig);
3612    zSig = aSig;
3613    if (aExp == 0 && zSig == 0) {
3614        return packFloat16(aSign, 0, 0);
3615    }
3616    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3617     * even if the input is denormal; however this is harmless because
3618     * the largest possible single-precision denormal is still smaller
3619     * than the smallest representable half-precision denormal, and so we
3620     * will end up ignoring aSig and returning via the "always return zero"
3621     * codepath.
3622     */
3623    zSig |= 0x00800000;
3624    aExp -= 0x3F1;
3625
3626    return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3627}
3628
3629/*----------------------------------------------------------------------------
3630| Returns the result of converting the double-precision floating-point value
3631| `a' to the extended double-precision floating-point format.  The conversion
3632| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3633| Arithmetic.
3634*----------------------------------------------------------------------------*/
3635
3636floatx80 float64_to_floatx80(float64 a, float_status *status)
3637{
3638    flag aSign;
3639    int aExp;
3640    uint64_t aSig;
3641
3642    a = float64_squash_input_denormal(a, status);
3643    aSig = extractFloat64Frac( a );
3644    aExp = extractFloat64Exp( a );
3645    aSign = extractFloat64Sign( a );
3646    if ( aExp == 0x7FF ) {
3647        if (aSig) {
3648            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3649        }
3650        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3651    }
3652    if ( aExp == 0 ) {
3653        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3654        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3655    }
3656    return
3657        packFloatx80(
3658            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3659
3660}
3661
3662/*----------------------------------------------------------------------------
3663| Returns the result of converting the double-precision floating-point value
3664| `a' to the quadruple-precision floating-point format.  The conversion is
3665| performed according to the IEC/IEEE Standard for Binary Floating-Point
3666| Arithmetic.
3667*----------------------------------------------------------------------------*/
3668
3669float128 float64_to_float128(float64 a, float_status *status)
3670{
3671    flag aSign;
3672    int aExp;
3673    uint64_t aSig, zSig0, zSig1;
3674
3675    a = float64_squash_input_denormal(a, status);
3676    aSig = extractFloat64Frac( a );
3677    aExp = extractFloat64Exp( a );
3678    aSign = extractFloat64Sign( a );
3679    if ( aExp == 0x7FF ) {
3680        if (aSig) {
3681            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3682        }
3683        return packFloat128( aSign, 0x7FFF, 0, 0 );
3684    }
3685    if ( aExp == 0 ) {
3686        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3687        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3688        --aExp;
3689    }
3690    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3691    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3692
3693}
3694
3695/*----------------------------------------------------------------------------
3696| Rounds the double-precision floating-point value `a' to an integer, and
3697| returns the result as a double-precision floating-point value.  The
3698| operation is performed according to the IEC/IEEE Standard for Binary
3699| Floating-Point Arithmetic.
3700*----------------------------------------------------------------------------*/
3701
3702float64 float64_round_to_int(float64 a, float_status *status)
3703{
3704    flag aSign;
3705    int aExp;
3706    uint64_t lastBitMask, roundBitsMask;
3707    uint64_t z;
3708    a = float64_squash_input_denormal(a, status);
3709
3710    aExp = extractFloat64Exp( a );
3711    if ( 0x433 <= aExp ) {
3712        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3713            return propagateFloat64NaN(a, a, status);
3714        }
3715        return a;
3716    }
3717    if ( aExp < 0x3FF ) {
3718        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3719        status->float_exception_flags |= float_flag_inexact;
3720        aSign = extractFloat64Sign( a );
3721        switch (status->float_rounding_mode) {
3722         case float_round_nearest_even:
3723            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3724                return packFloat64( aSign, 0x3FF, 0 );
3725            }
3726            break;
3727        case float_round_ties_away:
3728            if (aExp == 0x3FE) {
3729                return packFloat64(aSign, 0x3ff, 0);
3730            }
3731            break;
3732         case float_round_down:
3733            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3734         case float_round_up:
3735            return make_float64(
3736            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3737        }
3738        return packFloat64( aSign, 0, 0 );
3739    }
3740    lastBitMask = 1;
3741    lastBitMask <<= 0x433 - aExp;
3742    roundBitsMask = lastBitMask - 1;
3743    z = float64_val(a);
3744    switch (status->float_rounding_mode) {
3745    case float_round_nearest_even:
3746        z += lastBitMask >> 1;
3747        if ((z & roundBitsMask) == 0) {
3748            z &= ~lastBitMask;
3749        }
3750        break;
3751    case float_round_ties_away:
3752        z += lastBitMask >> 1;
3753        break;
3754    case float_round_to_zero:
3755        break;
3756    case float_round_up:
3757        if (!extractFloat64Sign(make_float64(z))) {
3758            z += roundBitsMask;
3759        }
3760        break;
3761    case float_round_down:
3762        if (extractFloat64Sign(make_float64(z))) {
3763            z += roundBitsMask;
3764        }
3765        break;
3766    default:
3767        abort();
3768    }
3769    z &= ~ roundBitsMask;
3770    if (z != float64_val(a)) {
3771        status->float_exception_flags |= float_flag_inexact;
3772    }
3773    return make_float64(z);
3774
3775}
3776
3777float64 float64_trunc_to_int(float64 a, float_status *status)
3778{
3779    int oldmode;
3780    float64 res;
3781    oldmode = status->float_rounding_mode;
3782    status->float_rounding_mode = float_round_to_zero;
3783    res = float64_round_to_int(a, status);
3784    status->float_rounding_mode = oldmode;
3785    return res;
3786}
3787
3788/*----------------------------------------------------------------------------
3789| Returns the result of adding the absolute values of the double-precision
3790| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3791| before being returned.  `zSign' is ignored if the result is a NaN.
3792| The addition is performed according to the IEC/IEEE Standard for Binary
3793| Floating-Point Arithmetic.
3794*----------------------------------------------------------------------------*/
3795
3796static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3797                              float_status *status)
3798{
3799    int aExp, bExp, zExp;
3800    uint64_t aSig, bSig, zSig;
3801    int expDiff;
3802
3803    aSig = extractFloat64Frac( a );
3804    aExp = extractFloat64Exp( a );
3805    bSig = extractFloat64Frac( b );
3806    bExp = extractFloat64Exp( b );
3807    expDiff = aExp - bExp;
3808    aSig <<= 9;
3809    bSig <<= 9;
3810    if ( 0 < expDiff ) {
3811        if ( aExp == 0x7FF ) {
3812            if (aSig) {
3813                return propagateFloat64NaN(a, b, status);
3814            }
3815            return a;
3816        }
3817        if ( bExp == 0 ) {
3818            --expDiff;
3819        }
3820        else {
3821            bSig |= LIT64( 0x2000000000000000 );
3822        }
3823        shift64RightJamming( bSig, expDiff, &bSig );
3824        zExp = aExp;
3825    }
3826    else if ( expDiff < 0 ) {
3827        if ( bExp == 0x7FF ) {
3828            if (bSig) {
3829                return propagateFloat64NaN(a, b, status);
3830            }
3831            return packFloat64( zSign, 0x7FF, 0 );
3832        }
3833        if ( aExp == 0 ) {
3834            ++expDiff;
3835        }
3836        else {
3837            aSig |= LIT64( 0x2000000000000000 );
3838        }
3839        shift64RightJamming( aSig, - expDiff, &aSig );
3840        zExp = bExp;
3841    }
3842    else {
3843        if ( aExp == 0x7FF ) {
3844            if (aSig | bSig) {
3845                return propagateFloat64NaN(a, b, status);
3846            }
3847            return a;
3848        }
3849        if ( aExp == 0 ) {
3850            if (status->flush_to_zero) {
3851                if (aSig | bSig) {
3852                    float_raise(float_flag_output_denormal, status);
3853                }
3854                return packFloat64(zSign, 0, 0);
3855            }
3856            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3857        }
3858        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3859        zExp = aExp;
3860        goto roundAndPack;
3861    }
3862    aSig |= LIT64( 0x2000000000000000 );
3863    zSig = ( aSig + bSig )<<1;
3864    --zExp;
3865    if ( (int64_t) zSig < 0 ) {
3866        zSig = aSig + bSig;
3867        ++zExp;
3868    }
3869 roundAndPack:
3870    return roundAndPackFloat64(zSign, zExp, zSig, status);
3871
3872}
3873
3874/*----------------------------------------------------------------------------
3875| Returns the result of subtracting the absolute values of the double-
3876| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3877| difference is negated before being returned.  `zSign' is ignored if the
3878| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3879| Standard for Binary Floating-Point Arithmetic.
3880*----------------------------------------------------------------------------*/
3881
3882static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3883                              float_status *status)
3884{
3885    int aExp, bExp, zExp;
3886    uint64_t aSig, bSig, zSig;
3887    int expDiff;
3888
3889    aSig = extractFloat64Frac( a );
3890    aExp = extractFloat64Exp( a );
3891    bSig = extractFloat64Frac( b );
3892    bExp = extractFloat64Exp( b );
3893    expDiff = aExp - bExp;
3894    aSig <<= 10;
3895    bSig <<= 10;
3896    if ( 0 < expDiff ) goto aExpBigger;
3897    if ( expDiff < 0 ) goto bExpBigger;
3898    if ( aExp == 0x7FF ) {
3899        if (aSig | bSig) {
3900            return propagateFloat64NaN(a, b, status);
3901        }
3902        float_raise(float_flag_invalid, status);
3903        return float64_default_nan(status);
3904    }
3905    if ( aExp == 0 ) {
3906        aExp = 1;
3907        bExp = 1;
3908    }
3909    if ( bSig < aSig ) goto aBigger;
3910    if ( aSig < bSig ) goto bBigger;
3911    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3912 bExpBigger:
3913    if ( bExp == 0x7FF ) {
3914        if (bSig) {
3915            return propagateFloat64NaN(a, b, status);
3916        }
3917        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3918    }
3919    if ( aExp == 0 ) {
3920        ++expDiff;
3921    }
3922    else {
3923        aSig |= LIT64( 0x4000000000000000 );
3924    }
3925    shift64RightJamming( aSig, - expDiff, &aSig );
3926    bSig |= LIT64( 0x4000000000000000 );
3927 bBigger:
3928    zSig = bSig - aSig;
3929    zExp = bExp;
3930    zSign ^= 1;
3931    goto normalizeRoundAndPack;
3932 aExpBigger:
3933    if ( aExp == 0x7FF ) {
3934        if (aSig) {
3935            return propagateFloat64NaN(a, b, status);
3936        }
3937        return a;
3938    }
3939    if ( bExp == 0 ) {
3940        --expDiff;
3941    }
3942    else {
3943        bSig |= LIT64( 0x4000000000000000 );
3944    }
3945    shift64RightJamming( bSig, expDiff, &bSig );
3946    aSig |= LIT64( 0x4000000000000000 );
3947 aBigger:
3948    zSig = aSig - bSig;
3949    zExp = aExp;
3950 normalizeRoundAndPack:
3951    --zExp;
3952    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3953
3954}
3955
3956/*----------------------------------------------------------------------------
3957| Returns the result of adding the double-precision floating-point values `a'
3958| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3959| Binary Floating-Point Arithmetic.
3960*----------------------------------------------------------------------------*/
3961
3962float64 float64_add(float64 a, float64 b, float_status *status)
3963{
3964    flag aSign, bSign;
3965    a = float64_squash_input_denormal(a, status);
3966    b = float64_squash_input_denormal(b, status);
3967
3968    aSign = extractFloat64Sign( a );
3969    bSign = extractFloat64Sign( b );
3970    if ( aSign == bSign ) {
3971        return addFloat64Sigs(a, b, aSign, status);
3972    }
3973    else {
3974        return subFloat64Sigs(a, b, aSign, status);
3975    }
3976
3977}
3978
3979/*----------------------------------------------------------------------------
3980| Returns the result of subtracting the double-precision floating-point values
3981| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3982| for Binary Floating-Point Arithmetic.
3983*----------------------------------------------------------------------------*/
3984
3985float64 float64_sub(float64 a, float64 b, float_status *status)
3986{
3987    flag aSign, bSign;
3988    a = float64_squash_input_denormal(a, status);
3989    b = float64_squash_input_denormal(b, status);
3990
3991    aSign = extractFloat64Sign( a );
3992    bSign = extractFloat64Sign( b );
3993    if ( aSign == bSign ) {
3994        return subFloat64Sigs(a, b, aSign, status);
3995    }
3996    else {
3997        return addFloat64Sigs(a, b, aSign, status);
3998    }
3999
4000}
4001
4002/*----------------------------------------------------------------------------
4003| Returns the result of multiplying the double-precision floating-point values
4004| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4005| for Binary Floating-Point Arithmetic.
4006*----------------------------------------------------------------------------*/
4007
4008float64 float64_mul(float64 a, float64 b, float_status *status)
4009{
4010    flag aSign, bSign, zSign;
4011    int aExp, bExp, zExp;
4012    uint64_t aSig, bSig, zSig0, zSig1;
4013
4014    a = float64_squash_input_denormal(a, status);
4015    b = float64_squash_input_denormal(b, status);
4016
4017    aSig = extractFloat64Frac( a );
4018    aExp = extractFloat64Exp( a );
4019    aSign = extractFloat64Sign( a );
4020    bSig = extractFloat64Frac( b );
4021    bExp = extractFloat64Exp( b );
4022    bSign = extractFloat64Sign( b );
4023    zSign = aSign ^ bSign;
4024    if ( aExp == 0x7FF ) {
4025        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4026            return propagateFloat64NaN(a, b, status);
4027        }
4028        if ( ( bExp | bSig ) == 0 ) {
4029            float_raise(float_flag_invalid, status);
4030            return float64_default_nan(status);
4031        }
4032        return packFloat64( zSign, 0x7FF, 0 );
4033    }
4034    if ( bExp == 0x7FF ) {
4035        if (bSig) {
4036            return propagateFloat64NaN(a, b, status);
4037        }
4038        if ( ( aExp | aSig ) == 0 ) {
4039            float_raise(float_flag_invalid, status);
4040            return float64_default_nan(status);
4041        }
4042        return packFloat64( zSign, 0x7FF, 0 );
4043    }
4044    if ( aExp == 0 ) {
4045        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4046        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4047    }
4048    if ( bExp == 0 ) {
4049        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4050        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4051    }
4052    zExp = aExp + bExp - 0x3FF;
4053    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4054    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4055    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4056    zSig0 |= ( zSig1 != 0 );
4057    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4058        zSig0 <<= 1;
4059        --zExp;
4060    }
4061    return roundAndPackFloat64(zSign, zExp, zSig0, status);
4062
4063}
4064
4065/*----------------------------------------------------------------------------
4066| Returns the result of dividing the double-precision floating-point value `a'
4067| by the corresponding value `b'.  The operation is performed according to
4068| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4069*----------------------------------------------------------------------------*/
4070
4071float64 float64_div(float64 a, float64 b, float_status *status)
4072{
4073    flag aSign, bSign, zSign;
4074    int aExp, bExp, zExp;
4075    uint64_t aSig, bSig, zSig;
4076    uint64_t rem0, rem1;
4077    uint64_t term0, term1;
4078    a = float64_squash_input_denormal(a, status);
4079    b = float64_squash_input_denormal(b, status);
4080
4081    aSig = extractFloat64Frac( a );
4082    aExp = extractFloat64Exp( a );
4083    aSign = extractFloat64Sign( a );
4084    bSig = extractFloat64Frac( b );
4085    bExp = extractFloat64Exp( b );
4086    bSign = extractFloat64Sign( b );
4087    zSign = aSign ^ bSign;
4088    if ( aExp == 0x7FF ) {
4089        if (aSig) {
4090            return propagateFloat64NaN(a, b, status);
4091        }
4092        if ( bExp == 0x7FF ) {
4093            if (bSig) {
4094                return propagateFloat64NaN(a, b, status);
4095            }
4096            float_raise(float_flag_invalid, status);
4097            return float64_default_nan(status);
4098        }
4099        return packFloat64( zSign, 0x7FF, 0 );
4100    }
4101    if ( bExp == 0x7FF ) {
4102        if (bSig) {
4103            return propagateFloat64NaN(a, b, status);
4104        }
4105        return packFloat64( zSign, 0, 0 );
4106    }
4107    if ( bExp == 0 ) {
4108        if ( bSig == 0 ) {
4109            if ( ( aExp | aSig ) == 0 ) {
4110                float_raise(float_flag_invalid, status);
4111                return float64_default_nan(status);
4112            }
4113            float_raise(float_flag_divbyzero, status);
4114            return packFloat64( zSign, 0x7FF, 0 );
4115        }
4116        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4117    }
4118    if ( aExp == 0 ) {
4119        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4120        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4121    }
4122    zExp = aExp - bExp + 0x3FD;
4123    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4124    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4125    if ( bSig <= ( aSig + aSig ) ) {
4126        aSig >>= 1;
4127        ++zExp;
4128    }
4129    zSig = estimateDiv128To64( aSig, 0, bSig );
4130    if ( ( zSig & 0x1FF ) <= 2 ) {
4131        mul64To128( bSig, zSig, &term0, &term1 );
4132        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4133        while ( (int64_t) rem0 < 0 ) {
4134            --zSig;
4135            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4136        }
4137        zSig |= ( rem1 != 0 );
4138    }
4139    return roundAndPackFloat64(zSign, zExp, zSig, status);
4140
4141}
4142
4143/*----------------------------------------------------------------------------
4144| Returns the remainder of the double-precision floating-point value `a'
4145| with respect to the corresponding value `b'.  The operation is performed
4146| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147*----------------------------------------------------------------------------*/
4148
4149float64 float64_rem(float64 a, float64 b, float_status *status)
4150{
4151    flag aSign, zSign;
4152    int aExp, bExp, expDiff;
4153    uint64_t aSig, bSig;
4154    uint64_t q, alternateASig;
4155    int64_t sigMean;
4156
4157    a = float64_squash_input_denormal(a, status);
4158    b = float64_squash_input_denormal(b, status);
4159    aSig = extractFloat64Frac( a );
4160    aExp = extractFloat64Exp( a );
4161    aSign = extractFloat64Sign( a );
4162    bSig = extractFloat64Frac( b );
4163    bExp = extractFloat64Exp( b );
4164    if ( aExp == 0x7FF ) {
4165        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4166            return propagateFloat64NaN(a, b, status);
4167        }
4168        float_raise(float_flag_invalid, status);
4169        return float64_default_nan(status);
4170    }
4171    if ( bExp == 0x7FF ) {
4172        if (bSig) {
4173            return propagateFloat64NaN(a, b, status);
4174        }
4175        return a;
4176    }
4177    if ( bExp == 0 ) {
4178        if ( bSig == 0 ) {
4179            float_raise(float_flag_invalid, status);
4180            return float64_default_nan(status);
4181        }
4182        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4183    }
4184    if ( aExp == 0 ) {
4185        if ( aSig == 0 ) return a;
4186        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4187    }
4188    expDiff = aExp - bExp;
4189    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4190    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4191    if ( expDiff < 0 ) {
4192        if ( expDiff < -1 ) return a;
4193        aSig >>= 1;
4194    }
4195    q = ( bSig <= aSig );
4196    if ( q ) aSig -= bSig;
4197    expDiff -= 64;
4198    while ( 0 < expDiff ) {
4199        q = estimateDiv128To64( aSig, 0, bSig );
4200        q = ( 2 < q ) ? q - 2 : 0;
4201        aSig = - ( ( bSig>>2 ) * q );
4202        expDiff -= 62;
4203    }
4204    expDiff += 64;
4205    if ( 0 < expDiff ) {
4206        q = estimateDiv128To64( aSig, 0, bSig );
4207        q = ( 2 < q ) ? q - 2 : 0;
4208        q >>= 64 - expDiff;
4209        bSig >>= 2;
4210        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4211    }
4212    else {
4213        aSig >>= 2;
4214        bSig >>= 2;
4215    }
4216    do {
4217        alternateASig = aSig;
4218        ++q;
4219        aSig -= bSig;
4220    } while ( 0 <= (int64_t) aSig );
4221    sigMean = aSig + alternateASig;
4222    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4223        aSig = alternateASig;
4224    }
4225    zSign = ( (int64_t) aSig < 0 );
4226    if ( zSign ) aSig = - aSig;
4227    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4228
4229}
4230
4231/*----------------------------------------------------------------------------
4232| Returns the result of multiplying the double-precision floating-point values
4233| `a' and `b' then adding 'c', with no intermediate rounding step after the
4234| multiplication.  The operation is performed according to the IEC/IEEE
4235| Standard for Binary Floating-Point Arithmetic 754-2008.
4236| The flags argument allows the caller to select negation of the
4237| addend, the intermediate product, or the final result. (The difference
4238| between this and having the caller do a separate negation is that negating
4239| externally will flip the sign bit on NaNs.)
4240*----------------------------------------------------------------------------*/
4241
4242float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4243                       float_status *status)
4244{
4245    flag aSign, bSign, cSign, zSign;
4246    int aExp, bExp, cExp, pExp, zExp, expDiff;
4247    uint64_t aSig, bSig, cSig;
4248    flag pInf, pZero, pSign;
4249    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4250    int shiftcount;
4251    flag signflip, infzero;
4252
4253    a = float64_squash_input_denormal(a, status);
4254    b = float64_squash_input_denormal(b, status);
4255    c = float64_squash_input_denormal(c, status);
4256    aSig = extractFloat64Frac(a);
4257    aExp = extractFloat64Exp(a);
4258    aSign = extractFloat64Sign(a);
4259    bSig = extractFloat64Frac(b);
4260    bExp = extractFloat64Exp(b);
4261    bSign = extractFloat64Sign(b);
4262    cSig = extractFloat64Frac(c);
4263    cExp = extractFloat64Exp(c);
4264    cSign = extractFloat64Sign(c);
4265
4266    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4267               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4268
4269    /* It is implementation-defined whether the cases of (0,inf,qnan)
4270     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4271     * they return if they do), so we have to hand this information
4272     * off to the target-specific pick-a-NaN routine.
4273     */
4274    if (((aExp == 0x7ff) && aSig) ||
4275        ((bExp == 0x7ff) && bSig) ||
4276        ((cExp == 0x7ff) && cSig)) {
4277        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4278    }
4279
4280    if (infzero) {
4281        float_raise(float_flag_invalid, status);
4282        return float64_default_nan(status);
4283    }
4284
4285    if (flags & float_muladd_negate_c) {
4286        cSign ^= 1;
4287    }
4288
4289    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4290
4291    /* Work out the sign and type of the product */
4292    pSign = aSign ^ bSign;
4293    if (flags & float_muladd_negate_product) {
4294        pSign ^= 1;
4295    }
4296    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4297    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4298
4299    if (cExp == 0x7ff) {
4300        if (pInf && (pSign ^ cSign)) {
4301            /* addition of opposite-signed infinities => InvalidOperation */
4302            float_raise(float_flag_invalid, status);
4303            return float64_default_nan(status);
4304        }
4305        /* Otherwise generate an infinity of the same sign */
4306        return packFloat64(cSign ^ signflip, 0x7ff, 0);
4307    }
4308
4309    if (pInf) {
4310        return packFloat64(pSign ^ signflip, 0x7ff, 0);
4311    }
4312
4313    if (pZero) {
4314        if (cExp == 0) {
4315            if (cSig == 0) {
4316                /* Adding two exact zeroes */
4317                if (pSign == cSign) {
4318                    zSign = pSign;
4319                } else if (status->float_rounding_mode == float_round_down) {
4320                    zSign = 1;
4321                } else {
4322                    zSign = 0;
4323                }
4324                return packFloat64(zSign ^ signflip, 0, 0);
4325            }
4326            /* Exact zero plus a denorm */
4327            if (status->flush_to_zero) {
4328                float_raise(float_flag_output_denormal, status);
4329                return packFloat64(cSign ^ signflip, 0, 0);
4330            }
4331        }
4332        /* Zero plus something non-zero : just return the something */
4333        if (flags & float_muladd_halve_result) {
4334            if (cExp == 0) {
4335                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4336            }
4337            /* Subtract one to halve, and one again because roundAndPackFloat64
4338             * wants one less than the true exponent.
4339             */
4340            cExp -= 2;
4341            cSig = (cSig | 0x0010000000000000ULL) << 10;
4342            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4343        }
4344        return packFloat64(cSign ^ signflip, cExp, cSig);
4345    }
4346
4347    if (aExp == 0) {
4348        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4349    }
4350    if (bExp == 0) {
4351        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4352    }
4353
4354    /* Calculate the actual result a * b + c */
4355
4356    /* Multiply first; this is easy. */
4357    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4358     * because we want the true exponent, not the "one-less-than"
4359     * flavour that roundAndPackFloat64() takes.
4360     */
4361    pExp = aExp + bExp - 0x3fe;
4362    aSig = (aSig | LIT64(0x0010000000000000))<<10;
4363    bSig = (bSig | LIT64(0x0010000000000000))<<11;
4364    mul64To128(aSig, bSig, &pSig0, &pSig1);
4365    if ((int64_t)(pSig0 << 1) >= 0) {
4366        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4367        pExp--;
4368    }
4369
4370    zSign = pSign ^ signflip;
4371
4372    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4373     * bit in position 126.
4374     */
4375    if (cExp == 0) {
4376        if (!cSig) {
4377            /* Throw out the special case of c being an exact zero now */
4378            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4379            if (flags & float_muladd_halve_result) {
4380                pExp--;
4381            }
4382            return roundAndPackFloat64(zSign, pExp - 1,
4383                                       pSig1, status);
4384        }
4385        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4386    }
4387
4388    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4389     * significand of the addend, with the explicit bit in position 126.
4390     */
4391    cSig0 = cSig << (126 - 64 - 52);
4392    cSig1 = 0;
4393    cSig0 |= LIT64(0x4000000000000000);
4394    expDiff = pExp - cExp;
4395
4396    if (pSign == cSign) {
4397        /* Addition */
4398        if (expDiff > 0) {
4399            /* scale c to match p */
4400            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4401            zExp = pExp;
4402        } else if (expDiff < 0) {
4403            /* scale p to match c */
4404            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4405            zExp = cExp;
4406        } else {
4407            /* no scaling needed */
4408            zExp = cExp;
4409        }
4410        /* Add significands and make sure explicit bit ends up in posn 126 */
4411        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4412        if ((int64_t)zSig0 < 0) {
4413            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4414        } else {
4415            zExp--;
4416        }
4417        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4418        if (flags & float_muladd_halve_result) {
4419            zExp--;
4420        }
4421        return roundAndPackFloat64(zSign, zExp, zSig1, status);
4422    } else {
4423        /* Subtraction */
4424        if (expDiff > 0) {
4425            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4426            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4427            zExp = pExp;
4428        } else if (expDiff < 0) {
4429            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4430            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4431            zExp = cExp;
4432            zSign ^= 1;
4433        } else {
4434            zExp = pExp;
4435            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4436                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4437            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4438                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4439                zSign ^= 1;
4440            } else {
4441                /* Exact zero */
4442                zSign = signflip;
4443                if (status->float_rounding_mode == float_round_down) {
4444                    zSign ^= 1;
4445                }
4446                return packFloat64(zSign, 0, 0);
4447            }
4448        }
4449        --zExp;
4450        /* Do the equivalent of normalizeRoundAndPackFloat64() but
4451         * starting with the significand in a pair of uint64_t.
4452         */
4453        if (zSig0) {
4454            shiftcount = countLeadingZeros64(zSig0) - 1;
4455            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4456            if (zSig1) {
4457                zSig0 |= 1;
4458            }
4459            zExp -= shiftcount;
4460        } else {
4461            shiftcount = countLeadingZeros64(zSig1);
4462            if (shiftcount == 0) {
4463                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4464                zExp -= 63;
4465            } else {
4466                shiftcount--;
4467                zSig0 = zSig1 << shiftcount;
4468                zExp -= (shiftcount + 64);
4469            }
4470        }
4471        if (flags & float_muladd_halve_result) {
4472            zExp--;
4473        }
4474        return roundAndPackFloat64(zSign, zExp, zSig0, status);
4475    }
4476}
4477
4478/*----------------------------------------------------------------------------
4479| Returns the square root of the double-precision floating-point value `a'.
4480| The operation is performed according to the IEC/IEEE Standard for Binary
4481| Floating-Point Arithmetic.
4482*----------------------------------------------------------------------------*/
4483
4484float64 float64_sqrt(float64 a, float_status *status)
4485{
4486    flag aSign;
4487    int aExp, zExp;
4488    uint64_t aSig, zSig, doubleZSig;
4489    uint64_t rem0, rem1, term0, term1;
4490    a = float64_squash_input_denormal(a, status);
4491
4492    aSig = extractFloat64Frac( a );
4493    aExp = extractFloat64Exp( a );
4494    aSign = extractFloat64Sign( a );
4495    if ( aExp == 0x7FF ) {
4496        if (aSig) {
4497            return propagateFloat64NaN(a, a, status);
4498        }
4499        if ( ! aSign ) return a;
4500        float_raise(float_flag_invalid, status);
4501        return float64_default_nan(status);
4502    }
4503    if ( aSign ) {
4504        if ( ( aExp | aSig ) == 0 ) return a;
4505        float_raise(float_flag_invalid, status);
4506        return float64_default_nan(status);
4507    }
4508    if ( aExp == 0 ) {
4509        if ( aSig == 0 ) return float64_zero;
4510        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4511    }
4512    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4513    aSig |= LIT64( 0x0010000000000000 );
4514    zSig = estimateSqrt32( aExp, aSig>>21 );
4515    aSig <<= 9 - ( aExp & 1 );
4516    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4517    if ( ( zSig & 0x1FF ) <= 5 ) {
4518        doubleZSig = zSig<<1;
4519        mul64To128( zSig, zSig, &term0, &term1 );
4520        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4521        while ( (int64_t) rem0 < 0 ) {
4522            --zSig;
4523            doubleZSig -= 2;
4524            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4525        }
4526        zSig |= ( ( rem0 | rem1 ) != 0 );
4527    }
4528    return roundAndPackFloat64(0, zExp, zSig, status);
4529
4530}
4531
4532/*----------------------------------------------------------------------------
4533| Returns the binary log of the double-precision floating-point value `a'.
4534| The operation is performed according to the IEC/IEEE Standard for Binary
4535| Floating-Point Arithmetic.
4536*----------------------------------------------------------------------------*/
4537float64 float64_log2(float64 a, float_status *status)
4538{
4539    flag aSign, zSign;
4540    int aExp;
4541    uint64_t aSig, aSig0, aSig1, zSig, i;
4542    a = float64_squash_input_denormal(a, status);
4543
4544    aSig = extractFloat64Frac( a );
4545    aExp = extractFloat64Exp( a );
4546    aSign = extractFloat64Sign( a );
4547
4548    if ( aExp == 0 ) {
4549        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4550        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4551    }
4552    if ( aSign ) {
4553        float_raise(float_flag_invalid, status);
4554        return float64_default_nan(status);
4555    }
4556    if ( aExp == 0x7FF ) {
4557        if (aSig) {
4558            return propagateFloat64NaN(a, float64_zero, status);
4559        }
4560        return a;
4561    }
4562
4563    aExp -= 0x3FF;
4564    aSig |= LIT64( 0x0010000000000000 );
4565    zSign = aExp < 0;
4566    zSig = (uint64_t)aExp << 52;
4567    for (i = 1LL << 51; i > 0; i >>= 1) {
4568        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4569        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4570        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4571            aSig >>= 1;
4572            zSig |= i;
4573        }
4574    }
4575
4576    if ( zSign )
4577        zSig = -zSig;
4578    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4579}
4580
4581/*----------------------------------------------------------------------------
4582| Returns 1 if the double-precision floating-point value `a' is equal to the
4583| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4584| if either operand is a NaN.  Otherwise, the comparison is performed
4585| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4586*----------------------------------------------------------------------------*/
4587
4588int float64_eq(float64 a, float64 b, float_status *status)
4589{
4590    uint64_t av, bv;
4591    a = float64_squash_input_denormal(a, status);
4592    b = float64_squash_input_denormal(b, status);
4593
4594    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4595         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4596       ) {
4597        float_raise(float_flag_invalid, status);
4598        return 0;
4599    }
4600    av = float64_val(a);
4601    bv = float64_val(b);
4602    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4603
4604}
4605
4606/*----------------------------------------------------------------------------
4607| Returns 1 if the double-precision floating-point value `a' is less than or
4608| equal to the corresponding value `b', and 0 otherwise.  The invalid
4609| exception is raised if either operand is a NaN.  The comparison is performed
4610| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4611*----------------------------------------------------------------------------*/
4612
4613int float64_le(float64 a, float64 b, float_status *status)
4614{
4615    flag aSign, bSign;
4616    uint64_t av, bv;
4617    a = float64_squash_input_denormal(a, status);
4618    b = float64_squash_input_denormal(b, status);
4619
4620    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4621         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4622       ) {
4623        float_raise(float_flag_invalid, status);
4624        return 0;
4625    }
4626    aSign = extractFloat64Sign( a );
4627    bSign = extractFloat64Sign( b );
4628    av = float64_val(a);
4629    bv = float64_val(b);
4630    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4631    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4632
4633}
4634
4635/*----------------------------------------------------------------------------
4636| Returns 1 if the double-precision floating-point value `a' is less than
4637| the corresponding value `b', and 0 otherwise.  The invalid exception is
4638| raised if either operand is a NaN.  The comparison is performed according
4639| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4640*----------------------------------------------------------------------------*/
4641
4642int float64_lt(float64 a, float64 b, float_status *status)
4643{
4644    flag aSign, bSign;
4645    uint64_t av, bv;
4646
4647    a = float64_squash_input_denormal(a, status);
4648    b = float64_squash_input_denormal(b, status);
4649    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4650         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4651       ) {
4652        float_raise(float_flag_invalid, status);
4653        return 0;
4654    }
4655    aSign = extractFloat64Sign( a );
4656    bSign = extractFloat64Sign( b );
4657    av = float64_val(a);
4658    bv = float64_val(b);
4659    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4660    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4661
4662}
4663
4664/*----------------------------------------------------------------------------
4665| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4666| be compared, and 0 otherwise.  The invalid exception is raised if either
4667| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4668| Standard for Binary Floating-Point Arithmetic.
4669*----------------------------------------------------------------------------*/
4670
4671int float64_unordered(float64 a, float64 b, float_status *status)
4672{
4673    a = float64_squash_input_denormal(a, status);
4674    b = float64_squash_input_denormal(b, status);
4675
4676    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4677         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4678       ) {
4679        float_raise(float_flag_invalid, status);
4680        return 1;
4681    }
4682    return 0;
4683}
4684
4685/*----------------------------------------------------------------------------
4686| Returns 1 if the double-precision floating-point value `a' is equal to the
4687| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4688| exception.The comparison is performed according to the IEC/IEEE Standard
4689| for Binary Floating-Point Arithmetic.
4690*----------------------------------------------------------------------------*/
4691
4692int float64_eq_quiet(float64 a, float64 b, float_status *status)
4693{
4694    uint64_t av, bv;
4695    a = float64_squash_input_denormal(a, status);
4696    b = float64_squash_input_denormal(b, status);
4697
4698    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4699         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4700       ) {
4701        if (float64_is_signaling_nan(a, status)
4702         || float64_is_signaling_nan(b, status)) {
4703            float_raise(float_flag_invalid, status);
4704        }
4705        return 0;
4706    }
4707    av = float64_val(a);
4708    bv = float64_val(b);
4709    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4710
4711}
4712
4713/*----------------------------------------------------------------------------
4714| Returns 1 if the double-precision floating-point value `a' is less than or
4715| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4716| cause an exception.  Otherwise, the comparison is performed according to the
4717| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4718*----------------------------------------------------------------------------*/
4719
4720int float64_le_quiet(float64 a, float64 b, float_status *status)
4721{
4722    flag aSign, bSign;
4723    uint64_t av, bv;
4724    a = float64_squash_input_denormal(a, status);
4725    b = float64_squash_input_denormal(b, status);
4726
4727    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4728         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4729       ) {
4730        if (float64_is_signaling_nan(a, status)
4731         || float64_is_signaling_nan(b, status)) {
4732            float_raise(float_flag_invalid, status);
4733        }
4734        return 0;
4735    }
4736    aSign = extractFloat64Sign( a );
4737    bSign = extractFloat64Sign( b );
4738    av = float64_val(a);
4739    bv = float64_val(b);
4740    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4741    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4742
4743}
4744
4745/*----------------------------------------------------------------------------
4746| Returns 1 if the double-precision floating-point value `a' is less than
4747| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4748| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4749| Standard for Binary Floating-Point Arithmetic.
4750*----------------------------------------------------------------------------*/
4751
4752int float64_lt_quiet(float64 a, float64 b, float_status *status)
4753{
4754    flag aSign, bSign;
4755    uint64_t av, bv;
4756    a = float64_squash_input_denormal(a, status);
4757    b = float64_squash_input_denormal(b, status);
4758
4759    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4760         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4761       ) {
4762        if (float64_is_signaling_nan(a, status)
4763         || float64_is_signaling_nan(b, status)) {
4764            float_raise(float_flag_invalid, status);
4765        }
4766        return 0;
4767    }
4768    aSign = extractFloat64Sign( a );
4769    bSign = extractFloat64Sign( b );
4770    av = float64_val(a);
4771    bv = float64_val(b);
4772    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4773    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4774
4775}
4776
4777/*----------------------------------------------------------------------------
4778| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4779| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4780| comparison is performed according to the IEC/IEEE Standard for Binary
4781| Floating-Point Arithmetic.
4782*----------------------------------------------------------------------------*/
4783
4784int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4785{
4786    a = float64_squash_input_denormal(a, status);
4787    b = float64_squash_input_denormal(b, status);
4788
4789    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4790         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4791       ) {
4792        if (float64_is_signaling_nan(a, status)
4793         || float64_is_signaling_nan(b, status)) {
4794            float_raise(float_flag_invalid, status);
4795        }
4796        return 1;
4797    }
4798    return 0;
4799}
4800
4801/*----------------------------------------------------------------------------
4802| Returns the result of converting the extended double-precision floating-
4803| point value `a' to the 32-bit two's complement integer format.  The
4804| conversion is performed according to the IEC/IEEE Standard for Binary
4805| Floating-Point Arithmetic---which means in particular that the conversion
4806| is rounded according to the current rounding mode.  If `a' is a NaN, the
4807| largest positive integer is returned.  Otherwise, if the conversion
4808| overflows, the largest integer with the same sign as `a' is returned.
4809*----------------------------------------------------------------------------*/
4810
4811int32_t floatx80_to_int32(floatx80 a, float_status *status)
4812{
4813    flag aSign;
4814    int32_t aExp, shiftCount;
4815    uint64_t aSig;
4816
4817    if (floatx80_invalid_encoding(a)) {
4818        float_raise(float_flag_invalid, status);
4819        return 1 << 31;
4820    }
4821    aSig = extractFloatx80Frac( a );
4822    aExp = extractFloatx80Exp( a );
4823    aSign = extractFloatx80Sign( a );
4824    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4825    shiftCount = 0x4037 - aExp;
4826    if ( shiftCount <= 0 ) shiftCount = 1;
4827    shift64RightJamming( aSig, shiftCount, &aSig );
4828    return roundAndPackInt32(aSign, aSig, status);
4829
4830}
4831
4832/*----------------------------------------------------------------------------
4833| Returns the result of converting the extended double-precision floating-
4834| point value `a' to the 32-bit two's complement integer format.  The
4835| conversion is performed according to the IEC/IEEE Standard for Binary
4836| Floating-Point Arithmetic, except that the conversion is always rounded
4837| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4838| Otherwise, if the conversion overflows, the largest integer with the same
4839| sign as `a' is returned.
4840*----------------------------------------------------------------------------*/
4841
4842int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4843{
4844    flag aSign;
4845    int32_t aExp, shiftCount;
4846    uint64_t aSig, savedASig;
4847    int32_t z;
4848
4849    if (floatx80_invalid_encoding(a)) {
4850        float_raise(float_flag_invalid, status);
4851        return 1 << 31;
4852    }
4853    aSig = extractFloatx80Frac( a );
4854    aExp = extractFloatx80Exp( a );
4855    aSign = extractFloatx80Sign( a );
4856    if ( 0x401E < aExp ) {
4857        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4858        goto invalid;
4859    }
4860    else if ( aExp < 0x3FFF ) {
4861        if (aExp || aSig) {
4862            status->float_exception_flags |= float_flag_inexact;
4863        }
4864        return 0;
4865    }
4866    shiftCount = 0x403E - aExp;
4867    savedASig = aSig;
4868    aSig >>= shiftCount;
4869    z = aSig;
4870    if ( aSign ) z = - z;
4871    if ( ( z < 0 ) ^ aSign ) {
4872 invalid:
4873        float_raise(float_flag_invalid, status);
4874        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4875    }
4876    if ( ( aSig<<shiftCount ) != savedASig ) {
4877        status->float_exception_flags |= float_flag_inexact;
4878    }
4879    return z;
4880
4881}
4882
4883/*----------------------------------------------------------------------------
4884| Returns the result of converting the extended double-precision floating-
4885| point value `a' to the 64-bit two's complement integer format.  The
4886| conversion is performed according to the IEC/IEEE Standard for Binary
4887| Floating-Point Arithmetic---which means in particular that the conversion
4888| is rounded according to the current rounding mode.  If `a' is a NaN,
4889| the largest positive integer is returned.  Otherwise, if the conversion
4890| overflows, the largest integer with the same sign as `a' is returned.
4891*----------------------------------------------------------------------------*/
4892
4893int64_t floatx80_to_int64(floatx80 a, float_status *status)
4894{
4895    flag aSign;
4896    int32_t aExp, shiftCount;
4897    uint64_t aSig, aSigExtra;
4898
4899    if (floatx80_invalid_encoding(a)) {
4900        float_raise(float_flag_invalid, status);
4901        return 1ULL << 63;
4902    }
4903    aSig = extractFloatx80Frac( a );
4904    aExp = extractFloatx80Exp( a );
4905    aSign = extractFloatx80Sign( a );
4906    shiftCount = 0x403E - aExp;
4907    if ( shiftCount <= 0 ) {
4908        if ( shiftCount ) {
4909            float_raise(float_flag_invalid, status);
4910            if (    ! aSign
4911                 || (    ( aExp == 0x7FFF )
4912                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4913               ) {
4914                return LIT64( 0x7FFFFFFFFFFFFFFF );
4915            }
4916            return (int64_t) LIT64( 0x8000000000000000 );
4917        }
4918        aSigExtra = 0;
4919    }
4920    else {
4921        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4922    }
4923    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4924
4925}
4926
4927/*----------------------------------------------------------------------------
4928| Returns the result of converting the extended double-precision floating-
4929| point value `a' to the 64-bit two's complement integer format.  The
4930| conversion is performed according to the IEC/IEEE Standard for Binary
4931| Floating-Point Arithmetic, except that the conversion is always rounded
4932| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4933| Otherwise, if the conversion overflows, the largest integer with the same
4934| sign as `a' is returned.
4935*----------------------------------------------------------------------------*/
4936
4937int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4938{
4939    flag aSign;
4940    int32_t aExp, shiftCount;
4941    uint64_t aSig;
4942    int64_t z;
4943
4944    if (floatx80_invalid_encoding(a)) {
4945        float_raise(float_flag_invalid, status);
4946        return 1ULL << 63;
4947    }
4948    aSig = extractFloatx80Frac( a );
4949    aExp = extractFloatx80Exp( a );
4950    aSign = extractFloatx80Sign( a );
4951    shiftCount = aExp - 0x403E;
4952    if ( 0 <= shiftCount ) {
4953        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4954        if ( ( a.high != 0xC03E ) || aSig ) {
4955            float_raise(float_flag_invalid, status);
4956            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4957                return LIT64( 0x7FFFFFFFFFFFFFFF );
4958            }
4959        }
4960        return (int64_t) LIT64( 0x8000000000000000 );
4961    }
4962    else if ( aExp < 0x3FFF ) {
4963        if (aExp | aSig) {
4964            status->float_exception_flags |= float_flag_inexact;
4965        }
4966        return 0;
4967    }
4968    z = aSig>>( - shiftCount );
4969    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4970        status->float_exception_flags |= float_flag_inexact;
4971    }
4972    if ( aSign ) z = - z;
4973    return z;
4974
4975}
4976
4977/*----------------------------------------------------------------------------
4978| Returns the result of converting the extended double-precision floating-
4979| point value `a' to the single-precision floating-point format.  The
4980| conversion is performed according to the IEC/IEEE Standard for Binary
4981| Floating-Point Arithmetic.
4982*----------------------------------------------------------------------------*/
4983
4984float32 floatx80_to_float32(floatx80 a, float_status *status)
4985{
4986    flag aSign;
4987    int32_t aExp;
4988    uint64_t aSig;
4989
4990    if (floatx80_invalid_encoding(a)) {
4991        float_raise(float_flag_invalid, status);
4992        return float32_default_nan(status);
4993    }
4994    aSig = extractFloatx80Frac( a );
4995    aExp = extractFloatx80Exp( a );
4996    aSign = extractFloatx80Sign( a );
4997    if ( aExp == 0x7FFF ) {
4998        if ( (uint64_t) ( aSig<<1 ) ) {
4999            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5000        }
5001        return packFloat32( aSign, 0xFF, 0 );
5002    }
5003    shift64RightJamming( aSig, 33, &aSig );
5004    if ( aExp || aSig ) aExp -= 0x3F81;
5005    return roundAndPackFloat32(aSign, aExp, aSig, status);
5006
5007}
5008
5009/*----------------------------------------------------------------------------
5010| Returns the result of converting the extended double-precision floating-
5011| point value `a' to the double-precision floating-point format.  The
5012| conversion is performed according to the IEC/IEEE Standard for Binary
5013| Floating-Point Arithmetic.
5014*----------------------------------------------------------------------------*/
5015
5016float64 floatx80_to_float64(floatx80 a, float_status *status)
5017{
5018    flag aSign;
5019    int32_t aExp;
5020    uint64_t aSig, zSig;
5021
5022    if (floatx80_invalid_encoding(a)) {
5023        float_raise(float_flag_invalid, status);
5024        return float64_default_nan(status);
5025    }
5026    aSig = extractFloatx80Frac( a );
5027    aExp = extractFloatx80Exp( a );
5028    aSign = extractFloatx80Sign( a );
5029    if ( aExp == 0x7FFF ) {
5030        if ( (uint64_t) ( aSig<<1 ) ) {
5031            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5032        }
5033        return packFloat64( aSign, 0x7FF, 0 );
5034    }
5035    shift64RightJamming( aSig, 1, &zSig );
5036    if ( aExp || aSig ) aExp -= 0x3C01;
5037    return roundAndPackFloat64(aSign, aExp, zSig, status);
5038
5039}
5040
5041/*----------------------------------------------------------------------------
5042| Returns the result of converting the extended double-precision floating-
5043| point value `a' to the quadruple-precision floating-point format.  The
5044| conversion is performed according to the IEC/IEEE Standard for Binary
5045| Floating-Point Arithmetic.
5046*----------------------------------------------------------------------------*/
5047
5048float128 floatx80_to_float128(floatx80 a, float_status *status)
5049{
5050    flag aSign;
5051    int aExp;
5052    uint64_t aSig, zSig0, zSig1;
5053
5054    if (floatx80_invalid_encoding(a)) {
5055        float_raise(float_flag_invalid, status);
5056        return float128_default_nan(status);
5057    }
5058    aSig = extractFloatx80Frac( a );
5059    aExp = extractFloatx80Exp( a );
5060    aSign = extractFloatx80Sign( a );
5061    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5062        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5063    }
5064    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5065    return packFloat128( aSign, aExp, zSig0, zSig1 );
5066
5067}
5068
5069/*----------------------------------------------------------------------------
5070| Rounds the extended double-precision floating-point value `a' to an integer,
5071| and returns the result as an extended quadruple-precision floating-point
5072| value.  The operation is performed according to the IEC/IEEE Standard for
5073| Binary Floating-Point Arithmetic.
5074*----------------------------------------------------------------------------*/
5075
5076floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5077{
5078    flag aSign;
5079    int32_t aExp;
5080    uint64_t lastBitMask, roundBitsMask;
5081    floatx80 z;
5082
5083    if (floatx80_invalid_encoding(a)) {
5084        float_raise(float_flag_invalid, status);
5085        return floatx80_default_nan(status);
5086    }
5087    aExp = extractFloatx80Exp( a );
5088    if ( 0x403E <= aExp ) {
5089        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5090            return propagateFloatx80NaN(a, a, status);
5091        }
5092        return a;
5093    }
5094    if ( aExp < 0x3FFF ) {
5095        if (    ( aExp == 0 )
5096             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5097            return a;
5098        }
5099        status->float_exception_flags |= float_flag_inexact;
5100        aSign = extractFloatx80Sign( a );
5101        switch (status->float_rounding_mode) {
5102         case float_round_nearest_even:
5103            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5104               ) {
5105                return
5106                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5107            }
5108            break;
5109        case float_round_ties_away:
5110            if (aExp == 0x3FFE) {
5111                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5112            }
5113            break;
5114         case float_round_down:
5115            return
5116                  aSign ?
5117                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5118                : packFloatx80( 0, 0, 0 );
5119         case float_round_up:
5120            return
5121                  aSign ? packFloatx80( 1, 0, 0 )
5122                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5123        }
5124        return packFloatx80( aSign, 0, 0 );
5125    }
5126    lastBitMask = 1;
5127    lastBitMask <<= 0x403E - aExp;
5128    roundBitsMask = lastBitMask - 1;
5129    z = a;
5130    switch (status->float_rounding_mode) {
5131    case float_round_nearest_even:
5132        z.low += lastBitMask>>1;
5133        if ((z.low & roundBitsMask) == 0) {
5134            z.low &= ~lastBitMask;
5135        }
5136        break;
5137    case float_round_ties_away:
5138        z.low += lastBitMask >> 1;
5139        break;
5140    case float_round_to_zero:
5141        break;
5142    case float_round_up:
5143        if (!extractFloatx80Sign(z)) {
5144            z.low += roundBitsMask;
5145        }
5146        break;
5147    case float_round_down:
5148        if (extractFloatx80Sign(z)) {
5149            z.low += roundBitsMask;
5150        }
5151        break;
5152    default:
5153        abort();
5154    }
5155    z.low &= ~ roundBitsMask;
5156    if ( z.low == 0 ) {
5157        ++z.high;
5158        z.low = LIT64( 0x8000000000000000 );
5159    }
5160    if (z.low != a.low) {
5161        status->float_exception_flags |= float_flag_inexact;
5162    }
5163    return z;
5164
5165}
5166
5167/*----------------------------------------------------------------------------
5168| Returns the result of adding the absolute values of the extended double-
5169| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5170| negated before being returned.  `zSign' is ignored if the result is a NaN.
5171| The addition is performed according to the IEC/IEEE Standard for Binary
5172| Floating-Point Arithmetic.
5173*----------------------------------------------------------------------------*/
5174
5175static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5176                                float_status *status)
5177{
5178    int32_t aExp, bExp, zExp;
5179    uint64_t aSig, bSig, zSig0, zSig1;
5180    int32_t expDiff;
5181
5182    aSig = extractFloatx80Frac( a );
5183    aExp = extractFloatx80Exp( a );
5184    bSig = extractFloatx80Frac( b );
5185    bExp = extractFloatx80Exp( b );
5186    expDiff = aExp - bExp;
5187    if ( 0 < expDiff ) {
5188        if ( aExp == 0x7FFF ) {
5189            if ((uint64_t)(aSig << 1)) {
5190                return propagateFloatx80NaN(a, b, status);
5191            }
5192            return a;
5193        }
5194        if ( bExp == 0 ) --expDiff;
5195        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5196        zExp = aExp;
5197    }
5198    else if ( expDiff < 0 ) {
5199        if ( bExp == 0x7FFF ) {
5200            if ((uint64_t)(bSig << 1)) {
5201                return propagateFloatx80NaN(a, b, status);
5202            }
5203            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5204        }
5205        if ( aExp == 0 ) ++expDiff;
5206        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5207        zExp = bExp;
5208    }
5209    else {
5210        if ( aExp == 0x7FFF ) {
5211            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5212                return propagateFloatx80NaN(a, b, status);
5213            }
5214            return a;
5215        }
5216        zSig1 = 0;
5217        zSig0 = aSig + bSig;
5218        if ( aExp == 0 ) {
5219            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5220            goto roundAndPack;
5221        }
5222        zExp = aExp;
5223        goto shiftRight1;
5224    }
5225    zSig0 = aSig + bSig;
5226    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5227 shiftRight1:
5228    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5229    zSig0 |= LIT64( 0x8000000000000000 );
5230    ++zExp;
5231 roundAndPack:
5232    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5233                                zSign, zExp, zSig0, zSig1, status);
5234}
5235
5236/*----------------------------------------------------------------------------
5237| Returns the result of subtracting the absolute values of the extended
5238| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5239| difference is negated before being returned.  `zSign' is ignored if the
5240| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5241| Standard for Binary Floating-Point Arithmetic.
5242*----------------------------------------------------------------------------*/
5243
5244static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5245                                float_status *status)
5246{
5247    int32_t aExp, bExp, zExp;
5248    uint64_t aSig, bSig, zSig0, zSig1;
5249    int32_t expDiff;
5250
5251    aSig = extractFloatx80Frac( a );
5252    aExp = extractFloatx80Exp( a );
5253    bSig = extractFloatx80Frac( b );
5254    bExp = extractFloatx80Exp( b );
5255    expDiff = aExp - bExp;
5256    if ( 0 < expDiff ) goto aExpBigger;
5257    if ( expDiff < 0 ) goto bExpBigger;
5258    if ( aExp == 0x7FFF ) {
5259        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5260            return propagateFloatx80NaN(a, b, status);
5261        }
5262        float_raise(float_flag_invalid, status);
5263        return floatx80_default_nan(status);
5264    }
5265    if ( aExp == 0 ) {
5266        aExp = 1;
5267        bExp = 1;
5268    }
5269    zSig1 = 0;
5270    if ( bSig < aSig ) goto aBigger;
5271    if ( aSig < bSig ) goto bBigger;
5272    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5273 bExpBigger:
5274    if ( bExp == 0x7FFF ) {
5275        if ((uint64_t)(bSig << 1)) {
5276            return propagateFloatx80NaN(a, b, status);
5277        }
5278        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5279    }
5280    if ( aExp == 0 ) ++expDiff;
5281    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5282 bBigger:
5283    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5284    zExp = bExp;
5285    zSign ^= 1;
5286    goto normalizeRoundAndPack;
5287 aExpBigger:
5288    if ( aExp == 0x7FFF ) {
5289        if ((uint64_t)(aSig << 1)) {
5290            return propagateFloatx80NaN(a, b, status);
5291        }
5292        return a;
5293    }
5294    if ( bExp == 0 ) --expDiff;
5295    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5296 aBigger:
5297    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5298    zExp = aExp;
5299 normalizeRoundAndPack:
5300    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5301                                         zSign, zExp, zSig0, zSig1, status);
5302}
5303
5304/*----------------------------------------------------------------------------
5305| Returns the result of adding the extended double-precision floating-point
5306| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5307| Standard for Binary Floating-Point Arithmetic.
5308*----------------------------------------------------------------------------*/
5309
5310floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5311{
5312    flag aSign, bSign;
5313
5314    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5315        float_raise(float_flag_invalid, status);
5316        return floatx80_default_nan(status);
5317    }
5318    aSign = extractFloatx80Sign( a );
5319    bSign = extractFloatx80Sign( b );
5320    if ( aSign == bSign ) {
5321        return addFloatx80Sigs(a, b, aSign, status);
5322    }
5323    else {
5324        return subFloatx80Sigs(a, b, aSign, status);
5325    }
5326
5327}
5328
5329/*----------------------------------------------------------------------------
5330| Returns the result of subtracting the extended double-precision floating-
5331| point values `a' and `b'.  The operation is performed according to the
5332| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5333*----------------------------------------------------------------------------*/
5334
5335floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5336{
5337    flag aSign, bSign;
5338
5339    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5340        float_raise(float_flag_invalid, status);
5341        return floatx80_default_nan(status);
5342    }
5343    aSign = extractFloatx80Sign( a );
5344    bSign = extractFloatx80Sign( b );
5345    if ( aSign == bSign ) {
5346        return subFloatx80Sigs(a, b, aSign, status);
5347    }
5348    else {
5349        return addFloatx80Sigs(a, b, aSign, status);
5350    }
5351
5352}
5353
5354/*----------------------------------------------------------------------------
5355| Returns the result of multiplying the extended double-precision floating-
5356| point values `a' and `b'.  The operation is performed according to the
5357| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5358*----------------------------------------------------------------------------*/
5359
5360floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5361{
5362    flag aSign, bSign, zSign;
5363    int32_t aExp, bExp, zExp;
5364    uint64_t aSig, bSig, zSig0, zSig1;
5365
5366    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5367        float_raise(float_flag_invalid, status);
5368        return floatx80_default_nan(status);
5369    }
5370    aSig = extractFloatx80Frac( a );
5371    aExp = extractFloatx80Exp( a );
5372    aSign = extractFloatx80Sign( a );
5373    bSig = extractFloatx80Frac( b );
5374    bExp = extractFloatx80Exp( b );
5375    bSign = extractFloatx80Sign( b );
5376    zSign = aSign ^ bSign;
5377    if ( aExp == 0x7FFF ) {
5378        if (    (uint64_t) ( aSig<<1 )
5379             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5380            return propagateFloatx80NaN(a, b, status);
5381        }
5382        if ( ( bExp | bSig ) == 0 ) goto invalid;
5383        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5384    }
5385    if ( bExp == 0x7FFF ) {
5386        if ((uint64_t)(bSig << 1)) {
5387            return propagateFloatx80NaN(a, b, status);
5388        }
5389        if ( ( aExp | aSig ) == 0 ) {
5390 invalid:
5391            float_raise(float_flag_invalid, status);
5392            return floatx80_default_nan(status);
5393        }
5394        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5395    }
5396    if ( aExp == 0 ) {
5397        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5398        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5399    }
5400    if ( bExp == 0 ) {
5401        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5402        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5403    }
5404    zExp = aExp + bExp - 0x3FFE;
5405    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5406    if ( 0 < (int64_t) zSig0 ) {
5407        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5408        --zExp;
5409    }
5410    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5411                                zSign, zExp, zSig0, zSig1, status);
5412}
5413
5414/*----------------------------------------------------------------------------
5415| Returns the result of dividing the extended double-precision floating-point
5416| value `a' by the corresponding value `b'.  The operation is performed
5417| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5418*----------------------------------------------------------------------------*/
5419
5420floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5421{
5422    flag aSign, bSign, zSign;
5423    int32_t aExp, bExp, zExp;
5424    uint64_t aSig, bSig, zSig0, zSig1;
5425    uint64_t rem0, rem1, rem2, term0, term1, term2;
5426
5427    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5428        float_raise(float_flag_invalid, status);
5429        return floatx80_default_nan(status);
5430    }
5431    aSig = extractFloatx80Frac( a );
5432    aExp = extractFloatx80Exp( a );
5433    aSign = extractFloatx80Sign( a );
5434    bSig = extractFloatx80Frac( b );
5435    bExp = extractFloatx80Exp( b );
5436    bSign = extractFloatx80Sign( b );
5437    zSign = aSign ^ bSign;
5438    if ( aExp == 0x7FFF ) {
5439        if ((uint64_t)(aSig << 1)) {
5440            return propagateFloatx80NaN(a, b, status);
5441        }
5442        if ( bExp == 0x7FFF ) {
5443            if ((uint64_t)(bSig << 1)) {
5444                return propagateFloatx80NaN(a, b, status);
5445            }
5446            goto invalid;
5447        }
5448        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5449    }
5450    if ( bExp == 0x7FFF ) {
5451        if ((uint64_t)(bSig << 1)) {
5452            return propagateFloatx80NaN(a, b, status);
5453        }
5454        return packFloatx80( zSign, 0, 0 );
5455    }
5456    if ( bExp == 0 ) {
5457        if ( bSig == 0 ) {
5458            if ( ( aExp | aSig ) == 0 ) {
5459 invalid:
5460                float_raise(float_flag_invalid, status);
5461                return floatx80_default_nan(status);
5462            }
5463            float_raise(float_flag_divbyzero, status);
5464            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5465        }
5466        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5467    }
5468    if ( aExp == 0 ) {
5469        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5470        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5471    }
5472    zExp = aExp - bExp + 0x3FFE;
5473    rem1 = 0;
5474    if ( bSig <= aSig ) {
5475        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5476        ++zExp;
5477    }
5478    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5479    mul64To128( bSig, zSig0, &term0, &term1 );
5480    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5481    while ( (int64_t) rem0 < 0 ) {
5482        --zSig0;
5483        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5484    }
5485    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5486    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5487        mul64To128( bSig, zSig1, &term1, &term2 );
5488        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5489        while ( (int64_t) rem1 < 0 ) {
5490            --zSig1;
5491            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5492        }
5493        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5494    }
5495    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5496                                zSign, zExp, zSig0, zSig1, status);
5497}
5498
5499/*----------------------------------------------------------------------------
5500| Returns the remainder of the extended double-precision floating-point value
5501| `a' with respect to the corresponding value `b'.  The operation is performed
5502| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5503*----------------------------------------------------------------------------*/
5504
5505floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5506{
5507    flag aSign, zSign;
5508    int32_t aExp, bExp, expDiff;
5509    uint64_t aSig0, aSig1, bSig;
5510    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5511
5512    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5513        float_raise(float_flag_invalid, status);
5514        return floatx80_default_nan(status);
5515    }
5516    aSig0 = extractFloatx80Frac( a );
5517    aExp = extractFloatx80Exp( a );
5518    aSign = extractFloatx80Sign( a );
5519    bSig = extractFloatx80Frac( b );
5520    bExp = extractFloatx80Exp( b );
5521    if ( aExp == 0x7FFF ) {
5522        if (    (uint64_t) ( aSig0<<1 )
5523             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5524            return propagateFloatx80NaN(a, b, status);
5525        }
5526        goto invalid;
5527    }
5528    if ( bExp == 0x7FFF ) {
5529        if ((uint64_t)(bSig << 1)) {
5530            return propagateFloatx80NaN(a, b, status);
5531        }
5532        return a;
5533    }
5534    if ( bExp == 0 ) {
5535        if ( bSig == 0 ) {
5536 invalid:
5537            float_raise(float_flag_invalid, status);
5538            return floatx80_default_nan(status);
5539        }
5540        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5541    }
5542    if ( aExp == 0 ) {
5543        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5544        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5545    }
5546    bSig |= LIT64( 0x8000000000000000 );
5547    zSign = aSign;
5548    expDiff = aExp - bExp;
5549    aSig1 = 0;
5550    if ( expDiff < 0 ) {
5551        if ( expDiff < -1 ) return a;
5552        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5553        expDiff = 0;
5554    }
5555    q = ( bSig <= aSig0 );
5556    if ( q ) aSig0 -= bSig;
5557    expDiff -= 64;
5558    while ( 0 < expDiff ) {
5559        q = estimateDiv128To64( aSig0, aSig1, bSig );
5560        q = ( 2 < q ) ? q - 2 : 0;
5561        mul64To128( bSig, q, &term0, &term1 );
5562        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5563        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5564        expDiff -= 62;
5565    }
5566    expDiff += 64;
5567    if ( 0 < expDiff ) {
5568        q = estimateDiv128To64( aSig0, aSig1, bSig );
5569        q = ( 2 < q ) ? q - 2 : 0;
5570        q >>= 64 - expDiff;
5571        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5572        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5573        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5574        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5575            ++q;
5576            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5577        }
5578    }
5579    else {
5580        term1 = 0;
5581        term0 = bSig;
5582    }
5583    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5584    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5585         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5586              && ( q & 1 ) )
5587       ) {
5588        aSig0 = alternateASig0;
5589        aSig1 = alternateASig1;
5590        zSign = ! zSign;
5591    }
5592    return
5593        normalizeRoundAndPackFloatx80(
5594            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5595
5596}
5597
5598/*----------------------------------------------------------------------------
5599| Returns the square root of the extended double-precision floating-point
5600| value `a'.  The operation is performed according to the IEC/IEEE Standard
5601| for Binary Floating-Point Arithmetic.
5602*----------------------------------------------------------------------------*/
5603
5604floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5605{
5606    flag aSign;
5607    int32_t aExp, zExp;
5608    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5609    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5610
5611    if (floatx80_invalid_encoding(a)) {
5612        float_raise(float_flag_invalid, status);
5613        return floatx80_default_nan(status);
5614    }
5615    aSig0 = extractFloatx80Frac( a );
5616    aExp = extractFloatx80Exp( a );
5617    aSign = extractFloatx80Sign( a );
5618    if ( aExp == 0x7FFF ) {
5619        if ((uint64_t)(aSig0 << 1)) {
5620            return propagateFloatx80NaN(a, a, status);
5621        }
5622        if ( ! aSign ) return a;
5623        goto invalid;
5624    }
5625    if ( aSign ) {
5626        if ( ( aExp | aSig0 ) == 0 ) return a;
5627 invalid:
5628        float_raise(float_flag_invalid, status);
5629        return floatx80_default_nan(status);
5630    }
5631    if ( aExp == 0 ) {
5632        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5633        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5634    }
5635    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5636    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5637    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5638    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5639    doubleZSig0 = zSig0<<1;
5640    mul64To128( zSig0, zSig0, &term0, &term1 );
5641    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5642    while ( (int64_t) rem0 < 0 ) {
5643        --zSig0;
5644        doubleZSig0 -= 2;
5645        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5646    }
5647    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5648    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5649        if ( zSig1 == 0 ) zSig1 = 1;
5650        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5651        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5652        mul64To128( zSig1, zSig1, &term2, &term3 );
5653        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5654        while ( (int64_t) rem1 < 0 ) {
5655            --zSig1;
5656            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5657            term3 |= 1;
5658            term2 |= doubleZSig0;
5659            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5660        }
5661        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5662    }
5663    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5664    zSig0 |= doubleZSig0;
5665    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5666                                0, zExp, zSig0, zSig1, status);
5667}
5668
5669/*----------------------------------------------------------------------------
5670| Returns 1 if the extended double-precision floating-point value `a' is equal
5671| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5672| raised if either operand is a NaN.  Otherwise, the comparison is performed
5673| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5674*----------------------------------------------------------------------------*/
5675
5676int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5677{
5678
5679    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5680        || (extractFloatx80Exp(a) == 0x7FFF
5681            && (uint64_t) (extractFloatx80Frac(a) << 1))
5682        || (extractFloatx80Exp(b) == 0x7FFF
5683            && (uint64_t) (extractFloatx80Frac(b) << 1))
5684       ) {
5685        float_raise(float_flag_invalid, status);
5686        return 0;
5687    }
5688    return
5689           ( a.low == b.low )
5690        && (    ( a.high == b.high )
5691             || (    ( a.low == 0 )
5692                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5693           );
5694
5695}
5696
5697/*----------------------------------------------------------------------------
5698| Returns 1 if the extended double-precision floating-point value `a' is
5699| less than or equal to the corresponding value `b', and 0 otherwise.  The
5700| invalid exception is raised if either operand is a NaN.  The comparison is
5701| performed according to the IEC/IEEE Standard for Binary Floating-Point
5702| Arithmetic.
5703*----------------------------------------------------------------------------*/
5704
5705int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5706{
5707    flag aSign, bSign;
5708
5709    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5710        || (extractFloatx80Exp(a) == 0x7FFF
5711            && (uint64_t) (extractFloatx80Frac(a) << 1))
5712        || (extractFloatx80Exp(b) == 0x7FFF
5713            && (uint64_t) (extractFloatx80Frac(b) << 1))
5714       ) {
5715        float_raise(float_flag_invalid, status);
5716        return 0;
5717    }
5718    aSign = extractFloatx80Sign( a );
5719    bSign = extractFloatx80Sign( b );
5720    if ( aSign != bSign ) {
5721        return
5722               aSign
5723            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5724                 == 0 );
5725    }
5726    return
5727          aSign ? le128( b.high, b.low, a.high, a.low )
5728        : le128( a.high, a.low, b.high, b.low );
5729
5730}
5731
5732/*----------------------------------------------------------------------------
5733| Returns 1 if the extended double-precision floating-point value `a' is
5734| less than the corresponding value `b', and 0 otherwise.  The invalid
5735| exception is raised if either operand is a NaN.  The comparison is performed
5736| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5737*----------------------------------------------------------------------------*/
5738
5739int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5740{
5741    flag aSign, bSign;
5742
5743    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5744        || (extractFloatx80Exp(a) == 0x7FFF
5745            && (uint64_t) (extractFloatx80Frac(a) << 1))
5746        || (extractFloatx80Exp(b) == 0x7FFF
5747            && (uint64_t) (extractFloatx80Frac(b) << 1))
5748       ) {
5749        float_raise(float_flag_invalid, status);
5750        return 0;
5751    }
5752    aSign = extractFloatx80Sign( a );
5753    bSign = extractFloatx80Sign( b );
5754    if ( aSign != bSign ) {
5755        return
5756               aSign
5757            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5758                 != 0 );
5759    }
5760    return
5761          aSign ? lt128( b.high, b.low, a.high, a.low )
5762        : lt128( a.high, a.low, b.high, b.low );
5763
5764}
5765
5766/*----------------------------------------------------------------------------
5767| Returns 1 if the extended double-precision floating-point values `a' and `b'
5768| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5769| either operand is a NaN.   The comparison is performed according to the
5770| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5771*----------------------------------------------------------------------------*/
5772int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5773{
5774    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5775        || (extractFloatx80Exp(a) == 0x7FFF
5776            && (uint64_t) (extractFloatx80Frac(a) << 1))
5777        || (extractFloatx80Exp(b) == 0x7FFF
5778            && (uint64_t) (extractFloatx80Frac(b) << 1))
5779       ) {
5780        float_raise(float_flag_invalid, status);
5781        return 1;
5782    }
5783    return 0;
5784}
5785
5786/*----------------------------------------------------------------------------
5787| Returns 1 if the extended double-precision floating-point value `a' is
5788| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5789| cause an exception.  The comparison is performed according to the IEC/IEEE
5790| Standard for Binary Floating-Point Arithmetic.
5791*----------------------------------------------------------------------------*/
5792
5793int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5794{
5795
5796    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5797        float_raise(float_flag_invalid, status);
5798        return 0;
5799    }
5800    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5801              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5802         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5803              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5804       ) {
5805        if (floatx80_is_signaling_nan(a, status)
5806         || floatx80_is_signaling_nan(b, status)) {
5807            float_raise(float_flag_invalid, status);
5808        }
5809        return 0;
5810    }
5811    return
5812           ( a.low == b.low )
5813        && (    ( a.high == b.high )
5814             || (    ( a.low == 0 )
5815                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5816           );
5817
5818}
5819
5820/*----------------------------------------------------------------------------
5821| Returns 1 if the extended double-precision floating-point value `a' is less
5822| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5823| do not cause an exception.  Otherwise, the comparison is performed according
5824| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5825*----------------------------------------------------------------------------*/
5826
5827int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5828{
5829    flag aSign, bSign;
5830
5831    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5832        float_raise(float_flag_invalid, status);
5833        return 0;
5834    }
5835    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5836              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5837         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5838              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5839       ) {
5840        if (floatx80_is_signaling_nan(a, status)
5841         || floatx80_is_signaling_nan(b, status)) {
5842            float_raise(float_flag_invalid, status);
5843        }
5844        return 0;
5845    }
5846    aSign = extractFloatx80Sign( a );
5847    bSign = extractFloatx80Sign( b );
5848    if ( aSign != bSign ) {
5849        return
5850               aSign
5851            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5852                 == 0 );
5853    }
5854    return
5855          aSign ? le128( b.high, b.low, a.high, a.low )
5856        : le128( a.high, a.low, b.high, b.low );
5857
5858}
5859
5860/*----------------------------------------------------------------------------
5861| Returns 1 if the extended double-precision floating-point value `a' is less
5862| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5863| an exception.  Otherwise, the comparison is performed according to the
5864| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5865*----------------------------------------------------------------------------*/
5866
5867int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5868{
5869    flag aSign, bSign;
5870
5871    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5872        float_raise(float_flag_invalid, status);
5873        return 0;
5874    }
5875    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5876              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5877         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5878              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5879       ) {
5880        if (floatx80_is_signaling_nan(a, status)
5881         || floatx80_is_signaling_nan(b, status)) {
5882            float_raise(float_flag_invalid, status);
5883        }
5884        return 0;
5885    }
5886    aSign = extractFloatx80Sign( a );
5887    bSign = extractFloatx80Sign( b );
5888    if ( aSign != bSign ) {
5889        return
5890               aSign
5891            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5892                 != 0 );
5893    }
5894    return
5895          aSign ? lt128( b.high, b.low, a.high, a.low )
5896        : lt128( a.high, a.low, b.high, b.low );
5897
5898}
5899
5900/*----------------------------------------------------------------------------
5901| Returns 1 if the extended double-precision floating-point values `a' and `b'
5902| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5903| The comparison is performed according to the IEC/IEEE Standard for Binary
5904| Floating-Point Arithmetic.
5905*----------------------------------------------------------------------------*/
5906int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5907{
5908    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5909        float_raise(float_flag_invalid, status);
5910        return 1;
5911    }
5912    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5913              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5914         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5915              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5916       ) {
5917        if (floatx80_is_signaling_nan(a, status)
5918         || floatx80_is_signaling_nan(b, status)) {
5919            float_raise(float_flag_invalid, status);
5920        }
5921        return 1;
5922    }
5923    return 0;
5924}
5925
5926/*----------------------------------------------------------------------------
5927| Returns the result of converting the quadruple-precision floating-point
5928| value `a' to the 32-bit two's complement integer format.  The conversion
5929| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930| Arithmetic---which means in particular that the conversion is rounded
5931| according to the current rounding mode.  If `a' is a NaN, the largest
5932| positive integer is returned.  Otherwise, if the conversion overflows, the
5933| largest integer with the same sign as `a' is returned.
5934*----------------------------------------------------------------------------*/
5935
5936int32_t float128_to_int32(float128 a, float_status *status)
5937{
5938    flag aSign;
5939    int32_t aExp, shiftCount;
5940    uint64_t aSig0, aSig1;
5941
5942    aSig1 = extractFloat128Frac1( a );
5943    aSig0 = extractFloat128Frac0( a );
5944    aExp = extractFloat128Exp( a );
5945    aSign = extractFloat128Sign( a );
5946    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5947    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5948    aSig0 |= ( aSig1 != 0 );
5949    shiftCount = 0x4028 - aExp;
5950    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5951    return roundAndPackInt32(aSign, aSig0, status);
5952
5953}
5954
5955/*----------------------------------------------------------------------------
5956| Returns the result of converting the quadruple-precision floating-point
5957| value `a' to the 32-bit two's complement integer format.  The conversion
5958| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5959| Arithmetic, except that the conversion is always rounded toward zero.  If
5960| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5961| conversion overflows, the largest integer with the same sign as `a' is
5962| returned.
5963*----------------------------------------------------------------------------*/
5964
5965int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5966{
5967    flag aSign;
5968    int32_t aExp, shiftCount;
5969    uint64_t aSig0, aSig1, savedASig;
5970    int32_t z;
5971
5972    aSig1 = extractFloat128Frac1( a );
5973    aSig0 = extractFloat128Frac0( a );
5974    aExp = extractFloat128Exp( a );
5975    aSign = extractFloat128Sign( a );
5976    aSig0 |= ( aSig1 != 0 );
5977    if ( 0x401E < aExp ) {
5978        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5979        goto invalid;
5980    }
5981    else if ( aExp < 0x3FFF ) {
5982        if (aExp || aSig0) {
5983            status->float_exception_flags |= float_flag_inexact;
5984        }
5985        return 0;
5986    }
5987    aSig0 |= LIT64( 0x0001000000000000 );
5988    shiftCount = 0x402F - aExp;
5989    savedASig = aSig0;
5990    aSig0 >>= shiftCount;
5991    z = aSig0;
5992    if ( aSign ) z = - z;
5993    if ( ( z < 0 ) ^ aSign ) {
5994 invalid:
5995        float_raise(float_flag_invalid, status);
5996        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5997    }
5998    if ( ( aSig0<<shiftCount ) != savedASig ) {
5999        status->float_exception_flags |= float_flag_inexact;
6000    }
6001    return z;
6002
6003}
6004
6005/*----------------------------------------------------------------------------
6006| Returns the result of converting the quadruple-precision floating-point
6007| value `a' to the 64-bit two's complement integer format.  The conversion
6008| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6009| Arithmetic---which means in particular that the conversion is rounded
6010| according to the current rounding mode.  If `a' is a NaN, the largest
6011| positive integer is returned.  Otherwise, if the conversion overflows, the
6012| largest integer with the same sign as `a' is returned.
6013*----------------------------------------------------------------------------*/
6014
6015int64_t float128_to_int64(float128 a, float_status *status)
6016{
6017    flag aSign;
6018    int32_t aExp, shiftCount;
6019    uint64_t aSig0, aSig1;
6020
6021    aSig1 = extractFloat128Frac1( a );
6022    aSig0 = extractFloat128Frac0( a );
6023    aExp = extractFloat128Exp( a );
6024    aSign = extractFloat128Sign( a );
6025    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6026    shiftCount = 0x402F - aExp;
6027    if ( shiftCount <= 0 ) {
6028        if ( 0x403E < aExp ) {
6029            float_raise(float_flag_invalid, status);
6030            if (    ! aSign
6031                 || (    ( aExp == 0x7FFF )
6032                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6033                    )
6034               ) {
6035                return LIT64( 0x7FFFFFFFFFFFFFFF );
6036            }
6037            return (int64_t) LIT64( 0x8000000000000000 );
6038        }
6039        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6040    }
6041    else {
6042        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6043    }
6044    return roundAndPackInt64(aSign, aSig0, aSig1, status);
6045
6046}
6047
6048/*----------------------------------------------------------------------------
6049| Returns the result of converting the quadruple-precision floating-point
6050| value `a' to the 64-bit two's complement integer format.  The conversion
6051| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6052| Arithmetic, except that the conversion is always rounded toward zero.
6053| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6054| the conversion overflows, the largest integer with the same sign as `a' is
6055| returned.
6056*----------------------------------------------------------------------------*/
6057
6058int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6059{
6060    flag aSign;
6061    int32_t aExp, shiftCount;
6062    uint64_t aSig0, aSig1;
6063    int64_t z;
6064
6065    aSig1 = extractFloat128Frac1( a );
6066    aSig0 = extractFloat128Frac0( a );
6067    aExp = extractFloat128Exp( a );
6068    aSign = extractFloat128Sign( a );
6069    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6070    shiftCount = aExp - 0x402F;
6071    if ( 0 < shiftCount ) {
6072        if ( 0x403E <= aExp ) {
6073            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6074            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6075                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6076                if (aSig1) {
6077                    status->float_exception_flags |= float_flag_inexact;
6078                }
6079            }
6080            else {
6081                float_raise(float_flag_invalid, status);
6082                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6083                    return LIT64( 0x7FFFFFFFFFFFFFFF );
6084                }
6085            }
6086            return (int64_t) LIT64( 0x8000000000000000 );
6087        }
6088        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6089        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6090            status->float_exception_flags |= float_flag_inexact;
6091        }
6092    }
6093    else {
6094        if ( aExp < 0x3FFF ) {
6095            if ( aExp | aSig0 | aSig1 ) {
6096                status->float_exception_flags |= float_flag_inexact;
6097            }
6098            return 0;
6099        }
6100        z = aSig0>>( - shiftCount );
6101        if (    aSig1
6102             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6103            status->float_exception_flags |= float_flag_inexact;
6104        }
6105    }
6106    if ( aSign ) z = - z;
6107    return z;
6108
6109}
6110
6111/*----------------------------------------------------------------------------
6112| Returns the result of converting the quadruple-precision floating-point
6113| value `a' to the single-precision floating-point format.  The conversion
6114| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6115| Arithmetic.
6116*----------------------------------------------------------------------------*/
6117
6118float32 float128_to_float32(float128 a, float_status *status)
6119{
6120    flag aSign;
6121    int32_t aExp;
6122    uint64_t aSig0, aSig1;
6123    uint32_t zSig;
6124
6125    aSig1 = extractFloat128Frac1( a );
6126    aSig0 = extractFloat128Frac0( a );
6127    aExp = extractFloat128Exp( a );
6128    aSign = extractFloat128Sign( a );
6129    if ( aExp == 0x7FFF ) {
6130        if ( aSig0 | aSig1 ) {
6131            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6132        }
6133        return packFloat32( aSign, 0xFF, 0 );
6134    }
6135    aSig0 |= ( aSig1 != 0 );
6136    shift64RightJamming( aSig0, 18, &aSig0 );
6137    zSig = aSig0;
6138    if ( aExp || zSig ) {
6139        zSig |= 0x40000000;
6140        aExp -= 0x3F81;
6141    }
6142    return roundAndPackFloat32(aSign, aExp, zSig, status);
6143
6144}
6145
6146/*----------------------------------------------------------------------------
6147| Returns the result of converting the quadruple-precision floating-point
6148| value `a' to the double-precision floating-point format.  The conversion
6149| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6150| Arithmetic.
6151*----------------------------------------------------------------------------*/
6152
6153float64 float128_to_float64(float128 a, float_status *status)
6154{
6155    flag aSign;
6156    int32_t aExp;
6157    uint64_t aSig0, aSig1;
6158
6159    aSig1 = extractFloat128Frac1( a );
6160    aSig0 = extractFloat128Frac0( a );
6161    aExp = extractFloat128Exp( a );
6162    aSign = extractFloat128Sign( a );
6163    if ( aExp == 0x7FFF ) {
6164        if ( aSig0 | aSig1 ) {
6165            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6166        }
6167        return packFloat64( aSign, 0x7FF, 0 );
6168    }
6169    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6170    aSig0 |= ( aSig1 != 0 );
6171    if ( aExp || aSig0 ) {
6172        aSig0 |= LIT64( 0x4000000000000000 );
6173        aExp -= 0x3C01;
6174    }
6175    return roundAndPackFloat64(aSign, aExp, aSig0, status);
6176
6177}
6178
6179/*----------------------------------------------------------------------------
6180| Returns the result of converting the quadruple-precision floating-point
6181| value `a' to the extended double-precision floating-point format.  The
6182| conversion is performed according to the IEC/IEEE Standard for Binary
6183| Floating-Point Arithmetic.
6184*----------------------------------------------------------------------------*/
6185
6186floatx80 float128_to_floatx80(float128 a, float_status *status)
6187{
6188    flag aSign;
6189    int32_t aExp;
6190    uint64_t aSig0, aSig1;
6191
6192    aSig1 = extractFloat128Frac1( a );
6193    aSig0 = extractFloat128Frac0( a );
6194    aExp = extractFloat128Exp( a );
6195    aSign = extractFloat128Sign( a );
6196    if ( aExp == 0x7FFF ) {
6197        if ( aSig0 | aSig1 ) {
6198            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6199        }
6200        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6201    }
6202    if ( aExp == 0 ) {
6203        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6204        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6205    }
6206    else {
6207        aSig0 |= LIT64( 0x0001000000000000 );
6208    }
6209    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6210    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6211
6212}
6213
6214/*----------------------------------------------------------------------------
6215| Rounds the quadruple-precision floating-point value `a' to an integer, and
6216| returns the result as a quadruple-precision floating-point value.  The
6217| operation is performed according to the IEC/IEEE Standard for Binary
6218| Floating-Point Arithmetic.
6219*----------------------------------------------------------------------------*/
6220
6221float128 float128_round_to_int(float128 a, float_status *status)
6222{
6223    flag aSign;
6224    int32_t aExp;
6225    uint64_t lastBitMask, roundBitsMask;
6226    float128 z;
6227
6228    aExp = extractFloat128Exp( a );
6229    if ( 0x402F <= aExp ) {
6230        if ( 0x406F <= aExp ) {
6231            if (    ( aExp == 0x7FFF )
6232                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6233               ) {
6234                return propagateFloat128NaN(a, a, status);
6235            }
6236            return a;
6237        }
6238        lastBitMask = 1;
6239        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6240        roundBitsMask = lastBitMask - 1;
6241        z = a;
6242        switch (status->float_rounding_mode) {
6243        case float_round_nearest_even:
6244            if ( lastBitMask ) {
6245                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6246                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6247            }
6248            else {
6249                if ( (int64_t) z.low < 0 ) {
6250                    ++z.high;
6251                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6252                }
6253            }
6254            break;
6255        case float_round_ties_away:
6256            if (lastBitMask) {
6257                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6258            } else {
6259                if ((int64_t) z.low < 0) {
6260                    ++z.high;
6261                }
6262            }
6263            break;
6264        case float_round_to_zero:
6265            break;
6266        case float_round_up:
6267            if (!extractFloat128Sign(z)) {
6268                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6269            }
6270            break;
6271        case float_round_down:
6272            if (extractFloat128Sign(z)) {
6273                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6274            }
6275            break;
6276        default:
6277            abort();
6278        }
6279        z.low &= ~ roundBitsMask;
6280    }
6281    else {
6282        if ( aExp < 0x3FFF ) {
6283            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6284            status->float_exception_flags |= float_flag_inexact;
6285            aSign = extractFloat128Sign( a );
6286            switch (status->float_rounding_mode) {
6287             case float_round_nearest_even:
6288                if (    ( aExp == 0x3FFE )
6289                     && (   extractFloat128Frac0( a )
6290                          | extractFloat128Frac1( a ) )
6291                   ) {
6292                    return packFloat128( aSign, 0x3FFF, 0, 0 );
6293                }
6294                break;
6295            case float_round_ties_away:
6296                if (aExp == 0x3FFE) {
6297                    return packFloat128(aSign, 0x3FFF, 0, 0);
6298                }
6299                break;
6300             case float_round_down:
6301                return
6302                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6303                    : packFloat128( 0, 0, 0, 0 );
6304             case float_round_up:
6305                return
6306                      aSign ? packFloat128( 1, 0, 0, 0 )
6307                    : packFloat128( 0, 0x3FFF, 0, 0 );
6308            }
6309            return packFloat128( aSign, 0, 0, 0 );
6310        }
6311        lastBitMask = 1;
6312        lastBitMask <<= 0x402F - aExp;
6313        roundBitsMask = lastBitMask - 1;
6314        z.low = 0;
6315        z.high = a.high;
6316        switch (status->float_rounding_mode) {
6317        case float_round_nearest_even:
6318            z.high += lastBitMask>>1;
6319            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6320                z.high &= ~ lastBitMask;
6321            }
6322            break;
6323        case float_round_ties_away:
6324            z.high += lastBitMask>>1;
6325            break;
6326        case float_round_to_zero:
6327            break;
6328        case float_round_up:
6329            if (!extractFloat128Sign(z)) {
6330                z.high |= ( a.low != 0 );
6331                z.high += roundBitsMask;
6332            }
6333            break;
6334        case float_round_down:
6335            if (extractFloat128Sign(z)) {
6336                z.high |= (a.low != 0);
6337                z.high += roundBitsMask;
6338            }
6339            break;
6340        default:
6341            abort();
6342        }
6343        z.high &= ~ roundBitsMask;
6344    }
6345    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6346        status->float_exception_flags |= float_flag_inexact;
6347    }
6348    return z;
6349
6350}
6351
6352/*----------------------------------------------------------------------------
6353| Returns the result of adding the absolute values of the quadruple-precision
6354| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6355| before being returned.  `zSign' is ignored if the result is a NaN.
6356| The addition is performed according to the IEC/IEEE Standard for Binary
6357| Floating-Point Arithmetic.
6358*----------------------------------------------------------------------------*/
6359
6360static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6361                                float_status *status)
6362{
6363    int32_t aExp, bExp, zExp;
6364    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6365    int32_t expDiff;
6366
6367    aSig1 = extractFloat128Frac1( a );
6368    aSig0 = extractFloat128Frac0( a );
6369    aExp = extractFloat128Exp( a );
6370    bSig1 = extractFloat128Frac1( b );
6371    bSig0 = extractFloat128Frac0( b );
6372    bExp = extractFloat128Exp( b );
6373    expDiff = aExp - bExp;
6374    if ( 0 < expDiff ) {
6375        if ( aExp == 0x7FFF ) {
6376            if (aSig0 | aSig1) {
6377                return propagateFloat128NaN(a, b, status);
6378            }
6379            return a;
6380        }
6381        if ( bExp == 0 ) {
6382            --expDiff;
6383        }
6384        else {
6385            bSig0 |= LIT64( 0x0001000000000000 );
6386        }
6387        shift128ExtraRightJamming(
6388            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6389        zExp = aExp;
6390    }
6391    else if ( expDiff < 0 ) {
6392        if ( bExp == 0x7FFF ) {
6393            if (bSig0 | bSig1) {
6394                return propagateFloat128NaN(a, b, status);
6395            }
6396            return packFloat128( zSign, 0x7FFF, 0, 0 );
6397        }
6398        if ( aExp == 0 ) {
6399            ++expDiff;
6400        }
6401        else {
6402            aSig0 |= LIT64( 0x0001000000000000 );
6403        }
6404        shift128ExtraRightJamming(
6405            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6406        zExp = bExp;
6407    }
6408    else {
6409        if ( aExp == 0x7FFF ) {
6410            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6411                return propagateFloat128NaN(a, b, status);
6412            }
6413            return a;
6414        }
6415        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6416        if ( aExp == 0 ) {
6417            if (status->flush_to_zero) {
6418                if (zSig0 | zSig1) {
6419                    float_raise(float_flag_output_denormal, status);
6420                }
6421                return packFloat128(zSign, 0, 0, 0);
6422            }
6423            return packFloat128( zSign, 0, zSig0, zSig1 );
6424        }
6425        zSig2 = 0;
6426        zSig0 |= LIT64( 0x0002000000000000 );
6427        zExp = aExp;
6428        goto shiftRight1;
6429    }
6430    aSig0 |= LIT64( 0x0001000000000000 );
6431    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6432    --zExp;
6433    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6434    ++zExp;
6435 shiftRight1:
6436    shift128ExtraRightJamming(
6437        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6438 roundAndPack:
6439    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6440
6441}
6442
6443/*----------------------------------------------------------------------------
6444| Returns the result of subtracting the absolute values of the quadruple-
6445| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6446| difference is negated before being returned.  `zSign' is ignored if the
6447| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6448| Standard for Binary Floating-Point Arithmetic.
6449*----------------------------------------------------------------------------*/
6450
6451static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6452                                float_status *status)
6453{
6454    int32_t aExp, bExp, zExp;
6455    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6456    int32_t expDiff;
6457
6458    aSig1 = extractFloat128Frac1( a );
6459    aSig0 = extractFloat128Frac0( a );
6460    aExp = extractFloat128Exp( a );
6461    bSig1 = extractFloat128Frac1( b );
6462    bSig0 = extractFloat128Frac0( b );
6463    bExp = extractFloat128Exp( b );
6464    expDiff = aExp - bExp;
6465    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6466    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6467    if ( 0 < expDiff ) goto aExpBigger;
6468    if ( expDiff < 0 ) goto bExpBigger;
6469    if ( aExp == 0x7FFF ) {
6470        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6471            return propagateFloat128NaN(a, b, status);
6472        }
6473        float_raise(float_flag_invalid, status);
6474        return float128_default_nan(status);
6475    }
6476    if ( aExp == 0 ) {
6477        aExp = 1;
6478        bExp = 1;
6479    }
6480    if ( bSig0 < aSig0 ) goto aBigger;
6481    if ( aSig0 < bSig0 ) goto bBigger;
6482    if ( bSig1 < aSig1 ) goto aBigger;
6483    if ( aSig1 < bSig1 ) goto bBigger;
6484    return packFloat128(status->float_rounding_mode == float_round_down,
6485                        0, 0, 0);
6486 bExpBigger:
6487    if ( bExp == 0x7FFF ) {
6488        if (bSig0 | bSig1) {
6489            return propagateFloat128NaN(a, b, status);
6490        }
6491        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6492    }
6493    if ( aExp == 0 ) {
6494        ++expDiff;
6495    }
6496    else {
6497        aSig0 |= LIT64( 0x4000000000000000 );
6498    }
6499    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6500    bSig0 |= LIT64( 0x4000000000000000 );
6501 bBigger:
6502    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6503    zExp = bExp;
6504    zSign ^= 1;
6505    goto normalizeRoundAndPack;
6506 aExpBigger:
6507    if ( aExp == 0x7FFF ) {
6508        if (aSig0 | aSig1) {
6509            return propagateFloat128NaN(a, b, status);
6510        }
6511        return a;
6512    }
6513    if ( bExp == 0 ) {
6514        --expDiff;
6515    }
6516    else {
6517        bSig0 |= LIT64( 0x4000000000000000 );
6518    }
6519    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6520    aSig0 |= LIT64( 0x4000000000000000 );
6521 aBigger:
6522    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6523    zExp = aExp;
6524 normalizeRoundAndPack:
6525    --zExp;
6526    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6527                                         status);
6528
6529}
6530
6531/*----------------------------------------------------------------------------
6532| Returns the result of adding the quadruple-precision floating-point values
6533| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6534| for Binary Floating-Point Arithmetic.
6535*----------------------------------------------------------------------------*/
6536
6537float128 float128_add(float128 a, float128 b, float_status *status)
6538{
6539    flag aSign, bSign;
6540
6541    aSign = extractFloat128Sign( a );
6542    bSign = extractFloat128Sign( b );
6543    if ( aSign == bSign ) {
6544        return addFloat128Sigs(a, b, aSign, status);
6545    }
6546    else {
6547        return subFloat128Sigs(a, b, aSign, status);
6548    }
6549
6550}
6551
6552/*----------------------------------------------------------------------------
6553| Returns the result of subtracting the quadruple-precision floating-point
6554| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6555| Standard for Binary Floating-Point Arithmetic.
6556*----------------------------------------------------------------------------*/
6557
6558float128 float128_sub(float128 a, float128 b, float_status *status)
6559{
6560    flag aSign, bSign;
6561
6562    aSign = extractFloat128Sign( a );
6563    bSign = extractFloat128Sign( b );
6564    if ( aSign == bSign ) {
6565        return subFloat128Sigs(a, b, aSign, status);
6566    }
6567    else {
6568        return addFloat128Sigs(a, b, aSign, status);
6569    }
6570
6571}
6572
6573/*----------------------------------------------------------------------------
6574| Returns the result of multiplying the quadruple-precision floating-point
6575| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6576| Standard for Binary Floating-Point Arithmetic.
6577*----------------------------------------------------------------------------*/
6578
6579float128 float128_mul(float128 a, float128 b, float_status *status)
6580{
6581    flag aSign, bSign, zSign;
6582    int32_t aExp, bExp, zExp;
6583    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6584
6585    aSig1 = extractFloat128Frac1( a );
6586    aSig0 = extractFloat128Frac0( a );
6587    aExp = extractFloat128Exp( a );
6588    aSign = extractFloat128Sign( a );
6589    bSig1 = extractFloat128Frac1( b );
6590    bSig0 = extractFloat128Frac0( b );
6591    bExp = extractFloat128Exp( b );
6592    bSign = extractFloat128Sign( b );
6593    zSign = aSign ^ bSign;
6594    if ( aExp == 0x7FFF ) {
6595        if (    ( aSig0 | aSig1 )
6596             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6597            return propagateFloat128NaN(a, b, status);
6598        }
6599        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6600        return packFloat128( zSign, 0x7FFF, 0, 0 );
6601    }
6602    if ( bExp == 0x7FFF ) {
6603        if (bSig0 | bSig1) {
6604            return propagateFloat128NaN(a, b, status);
6605        }
6606        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6607 invalid:
6608            float_raise(float_flag_invalid, status);
6609            return float128_default_nan(status);
6610        }
6611        return packFloat128( zSign, 0x7FFF, 0, 0 );
6612    }
6613    if ( aExp == 0 ) {
6614        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6615        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6616    }
6617    if ( bExp == 0 ) {
6618        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6619        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6620    }
6621    zExp = aExp + bExp - 0x4000;
6622    aSig0 |= LIT64( 0x0001000000000000 );
6623    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6624    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6625    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6626    zSig2 |= ( zSig3 != 0 );
6627    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6628        shift128ExtraRightJamming(
6629            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6630        ++zExp;
6631    }
6632    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6633
6634}
6635
6636/*----------------------------------------------------------------------------
6637| Returns the result of dividing the quadruple-precision floating-point value
6638| `a' by the corresponding value `b'.  The operation is performed according to
6639| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640*----------------------------------------------------------------------------*/
6641
6642float128 float128_div(float128 a, float128 b, float_status *status)
6643{
6644    flag aSign, bSign, zSign;
6645    int32_t aExp, bExp, zExp;
6646    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6647    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6648
6649    aSig1 = extractFloat128Frac1( a );
6650    aSig0 = extractFloat128Frac0( a );
6651    aExp = extractFloat128Exp( a );
6652    aSign = extractFloat128Sign( a );
6653    bSig1 = extractFloat128Frac1( b );
6654    bSig0 = extractFloat128Frac0( b );
6655    bExp = extractFloat128Exp( b );
6656    bSign = extractFloat128Sign( b );
6657    zSign = aSign ^ bSign;
6658    if ( aExp == 0x7FFF ) {
6659        if (aSig0 | aSig1) {
6660            return propagateFloat128NaN(a, b, status);
6661        }
6662        if ( bExp == 0x7FFF ) {
6663            if (bSig0 | bSig1) {
6664                return propagateFloat128NaN(a, b, status);
6665            }
6666            goto invalid;
6667        }
6668        return packFloat128( zSign, 0x7FFF, 0, 0 );
6669    }
6670    if ( bExp == 0x7FFF ) {
6671        if (bSig0 | bSig1) {
6672            return propagateFloat128NaN(a, b, status);
6673        }
6674        return packFloat128( zSign, 0, 0, 0 );
6675    }
6676    if ( bExp == 0 ) {
6677        if ( ( bSig0 | bSig1 ) == 0 ) {
6678            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6679 invalid:
6680                float_raise(float_flag_invalid, status);
6681                return float128_default_nan(status);
6682            }
6683            float_raise(float_flag_divbyzero, status);
6684            return packFloat128( zSign, 0x7FFF, 0, 0 );
6685        }
6686        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6687    }
6688    if ( aExp == 0 ) {
6689        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6690        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6691    }
6692    zExp = aExp - bExp + 0x3FFD;
6693    shortShift128Left(
6694        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6695    shortShift128Left(
6696        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6697    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6698        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6699        ++zExp;
6700    }
6701    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6702    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6703    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6704    while ( (int64_t) rem0 < 0 ) {
6705        --zSig0;
6706        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6707    }
6708    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6709    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6710        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6711        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6712        while ( (int64_t) rem1 < 0 ) {
6713            --zSig1;
6714            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6715        }
6716        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6717    }
6718    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6719    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6720
6721}
6722
6723/*----------------------------------------------------------------------------
6724| Returns the remainder of the quadruple-precision floating-point value `a'
6725| with respect to the corresponding value `b'.  The operation is performed
6726| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6727*----------------------------------------------------------------------------*/
6728
6729float128 float128_rem(float128 a, float128 b, float_status *status)
6730{
6731    flag aSign, zSign;
6732    int32_t aExp, bExp, expDiff;
6733    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6734    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6735    int64_t sigMean0;
6736
6737    aSig1 = extractFloat128Frac1( a );
6738    aSig0 = extractFloat128Frac0( a );
6739    aExp = extractFloat128Exp( a );
6740    aSign = extractFloat128Sign( a );
6741    bSig1 = extractFloat128Frac1( b );
6742    bSig0 = extractFloat128Frac0( b );
6743    bExp = extractFloat128Exp( b );
6744    if ( aExp == 0x7FFF ) {
6745        if (    ( aSig0 | aSig1 )
6746             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6747            return propagateFloat128NaN(a, b, status);
6748        }
6749        goto invalid;
6750    }
6751    if ( bExp == 0x7FFF ) {
6752        if (bSig0 | bSig1) {
6753            return propagateFloat128NaN(a, b, status);
6754        }
6755        return a;
6756    }
6757    if ( bExp == 0 ) {
6758        if ( ( bSig0 | bSig1 ) == 0 ) {
6759 invalid:
6760            float_raise(float_flag_invalid, status);
6761            return float128_default_nan(status);
6762        }
6763        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6764    }
6765    if ( aExp == 0 ) {
6766        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6767        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6768    }
6769    expDiff = aExp - bExp;
6770    if ( expDiff < -1 ) return a;
6771    shortShift128Left(
6772        aSig0 | LIT64( 0x0001000000000000 ),
6773        aSig1,
6774        15 - ( expDiff < 0 ),
6775        &aSig0,
6776        &aSig1
6777    );
6778    shortShift128Left(
6779        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6780    q = le128( bSig0, bSig1, aSig0, aSig1 );
6781    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6782    expDiff -= 64;
6783    while ( 0 < expDiff ) {
6784        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6785        q = ( 4 < q ) ? q - 4 : 0;
6786        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6787        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6788        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6789        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6790        expDiff -= 61;
6791    }
6792    if ( -64 < expDiff ) {
6793        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6794        q = ( 4 < q ) ? q - 4 : 0;
6795        q >>= - expDiff;
6796        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6797        expDiff += 52;
6798        if ( expDiff < 0 ) {
6799            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6800        }
6801        else {
6802            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6803        }
6804        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6805        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6806    }
6807    else {
6808        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6809        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6810    }
6811    do {
6812        alternateASig0 = aSig0;
6813        alternateASig1 = aSig1;
6814        ++q;
6815        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6816    } while ( 0 <= (int64_t) aSig0 );
6817    add128(
6818        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6819    if (    ( sigMean0 < 0 )
6820         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6821        aSig0 = alternateASig0;
6822        aSig1 = alternateASig1;
6823    }
6824    zSign = ( (int64_t) aSig0 < 0 );
6825    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6826    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6827                                         status);
6828}
6829
6830/*----------------------------------------------------------------------------
6831| Returns the square root of the quadruple-precision floating-point value `a'.
6832| The operation is performed according to the IEC/IEEE Standard for Binary
6833| Floating-Point Arithmetic.
6834*----------------------------------------------------------------------------*/
6835
6836float128 float128_sqrt(float128 a, float_status *status)
6837{
6838    flag aSign;
6839    int32_t aExp, zExp;
6840    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6841    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6842
6843    aSig1 = extractFloat128Frac1( a );
6844    aSig0 = extractFloat128Frac0( a );
6845    aExp = extractFloat128Exp( a );
6846    aSign = extractFloat128Sign( a );
6847    if ( aExp == 0x7FFF ) {
6848        if (aSig0 | aSig1) {
6849            return propagateFloat128NaN(a, a, status);
6850        }
6851        if ( ! aSign ) return a;
6852        goto invalid;
6853    }
6854    if ( aSign ) {
6855        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6856 invalid:
6857        float_raise(float_flag_invalid, status);
6858        return float128_default_nan(status);
6859    }
6860    if ( aExp == 0 ) {
6861        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6862        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6863    }
6864    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6865    aSig0 |= LIT64( 0x0001000000000000 );
6866    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6867    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6868    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6869    doubleZSig0 = zSig0<<1;
6870    mul64To128( zSig0, zSig0, &term0, &term1 );
6871    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6872    while ( (int64_t) rem0 < 0 ) {
6873        --zSig0;
6874        doubleZSig0 -= 2;
6875        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6876    }
6877    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6878    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6879        if ( zSig1 == 0 ) zSig1 = 1;
6880        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6881        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6882        mul64To128( zSig1, zSig1, &term2, &term3 );
6883        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6884        while ( (int64_t) rem1 < 0 ) {
6885            --zSig1;
6886            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6887            term3 |= 1;
6888            term2 |= doubleZSig0;
6889            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6890        }
6891        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6892    }
6893    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6894    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6895
6896}
6897
6898/*----------------------------------------------------------------------------
6899| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6900| the corresponding value `b', and 0 otherwise.  The invalid exception is
6901| raised if either operand is a NaN.  Otherwise, the comparison is performed
6902| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6903*----------------------------------------------------------------------------*/
6904
6905int float128_eq(float128 a, float128 b, float_status *status)
6906{
6907
6908    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6909              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6910         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6911              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6912       ) {
6913        float_raise(float_flag_invalid, status);
6914        return 0;
6915    }
6916    return
6917           ( a.low == b.low )
6918        && (    ( a.high == b.high )
6919             || (    ( a.low == 0 )
6920                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6921           );
6922
6923}
6924
6925/*----------------------------------------------------------------------------
6926| Returns 1 if the quadruple-precision floating-point value `a' is less than
6927| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6928| exception is raised if either operand is a NaN.  The comparison is performed
6929| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6930*----------------------------------------------------------------------------*/
6931
6932int float128_le(float128 a, float128 b, float_status *status)
6933{
6934    flag aSign, bSign;
6935
6936    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6937              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6939              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6940       ) {
6941        float_raise(float_flag_invalid, status);
6942        return 0;
6943    }
6944    aSign = extractFloat128Sign( a );
6945    bSign = extractFloat128Sign( b );
6946    if ( aSign != bSign ) {
6947        return
6948               aSign
6949            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6950                 == 0 );
6951    }
6952    return
6953          aSign ? le128( b.high, b.low, a.high, a.low )
6954        : le128( a.high, a.low, b.high, b.low );
6955
6956}
6957
6958/*----------------------------------------------------------------------------
6959| Returns 1 if the quadruple-precision floating-point value `a' is less than
6960| the corresponding value `b', and 0 otherwise.  The invalid exception is
6961| raised if either operand is a NaN.  The comparison is performed according
6962| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6963*----------------------------------------------------------------------------*/
6964
6965int float128_lt(float128 a, float128 b, float_status *status)
6966{
6967    flag aSign, bSign;
6968
6969    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6970              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6971         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6972              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6973       ) {
6974        float_raise(float_flag_invalid, status);
6975        return 0;
6976    }
6977    aSign = extractFloat128Sign( a );
6978    bSign = extractFloat128Sign( b );
6979    if ( aSign != bSign ) {
6980        return
6981               aSign
6982            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6983                 != 0 );
6984    }
6985    return
6986          aSign ? lt128( b.high, b.low, a.high, a.low )
6987        : lt128( a.high, a.low, b.high, b.low );
6988
6989}
6990
6991/*----------------------------------------------------------------------------
6992| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6993| be compared, and 0 otherwise.  The invalid exception is raised if either
6994| operand is a NaN. The comparison is performed according to the IEC/IEEE
6995| Standard for Binary Floating-Point Arithmetic.
6996*----------------------------------------------------------------------------*/
6997
6998int float128_unordered(float128 a, float128 b, float_status *status)
6999{
7000    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7001              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7002         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7003              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7004       ) {
7005        float_raise(float_flag_invalid, status);
7006        return 1;
7007    }
7008    return 0;
7009}
7010
7011/*----------------------------------------------------------------------------
7012| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7013| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7014| exception.  The comparison is performed according to the IEC/IEEE Standard
7015| for Binary Floating-Point Arithmetic.
7016*----------------------------------------------------------------------------*/
7017
7018int float128_eq_quiet(float128 a, float128 b, float_status *status)
7019{
7020
7021    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7022              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7023         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7024              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7025       ) {
7026        if (float128_is_signaling_nan(a, status)
7027         || float128_is_signaling_nan(b, status)) {
7028            float_raise(float_flag_invalid, status);
7029        }
7030        return 0;
7031    }
7032    return
7033           ( a.low == b.low )
7034        && (    ( a.high == b.high )
7035             || (    ( a.low == 0 )
7036                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7037           );
7038
7039}
7040
7041/*----------------------------------------------------------------------------
7042| Returns 1 if the quadruple-precision floating-point value `a' is less than
7043| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7044| cause an exception.  Otherwise, the comparison is performed according to the
7045| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7046*----------------------------------------------------------------------------*/
7047
7048int float128_le_quiet(float128 a, float128 b, float_status *status)
7049{
7050    flag aSign, bSign;
7051
7052    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7053              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7054         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7055              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7056       ) {
7057        if (float128_is_signaling_nan(a, status)
7058         || float128_is_signaling_nan(b, status)) {
7059            float_raise(float_flag_invalid, status);
7060        }
7061        return 0;
7062    }
7063    aSign = extractFloat128Sign( a );
7064    bSign = extractFloat128Sign( b );
7065    if ( aSign != bSign ) {
7066        return
7067               aSign
7068            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7069                 == 0 );
7070    }
7071    return
7072          aSign ? le128( b.high, b.low, a.high, a.low )
7073        : le128( a.high, a.low, b.high, b.low );
7074
7075}
7076
7077/*----------------------------------------------------------------------------
7078| Returns 1 if the quadruple-precision floating-point value `a' is less than
7079| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7080| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7081| Standard for Binary Floating-Point Arithmetic.
7082*----------------------------------------------------------------------------*/
7083
7084int float128_lt_quiet(float128 a, float128 b, float_status *status)
7085{
7086    flag aSign, bSign;
7087
7088    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7089              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7090         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7091              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7092       ) {
7093        if (float128_is_signaling_nan(a, status)
7094         || float128_is_signaling_nan(b, status)) {
7095            float_raise(float_flag_invalid, status);
7096        }
7097        return 0;
7098    }
7099    aSign = extractFloat128Sign( a );
7100    bSign = extractFloat128Sign( b );
7101    if ( aSign != bSign ) {
7102        return
7103               aSign
7104            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7105                 != 0 );
7106    }
7107    return
7108          aSign ? lt128( b.high, b.low, a.high, a.low )
7109        : lt128( a.high, a.low, b.high, b.low );
7110
7111}
7112
7113/*----------------------------------------------------------------------------
7114| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7115| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7116| comparison is performed according to the IEC/IEEE Standard for Binary
7117| Floating-Point Arithmetic.
7118*----------------------------------------------------------------------------*/
7119
7120int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7121{
7122    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7123              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7124         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7125              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7126       ) {
7127        if (float128_is_signaling_nan(a, status)
7128         || float128_is_signaling_nan(b, status)) {
7129            float_raise(float_flag_invalid, status);
7130        }
7131        return 1;
7132    }
7133    return 0;
7134}
7135
7136/* misc functions */
7137float32 uint32_to_float32(uint32_t a, float_status *status)
7138{
7139    return int64_to_float32(a, status);
7140}
7141
7142float64 uint32_to_float64(uint32_t a, float_status *status)
7143{
7144    return int64_to_float64(a, status);
7145}
7146
7147uint32_t float32_to_uint32(float32 a, float_status *status)
7148{
7149    int64_t v;
7150    uint32_t res;
7151    int old_exc_flags = get_float_exception_flags(status);
7152
7153    v = float32_to_int64(a, status);
7154    if (v < 0) {
7155        res = 0;
7156    } else if (v > 0xffffffff) {
7157        res = 0xffffffff;
7158    } else {
7159        return v;
7160    }
7161    set_float_exception_flags(old_exc_flags, status);
7162    float_raise(float_flag_invalid, status);
7163    return res;
7164}
7165
7166uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7167{
7168    int64_t v;
7169    uint32_t res;
7170    int old_exc_flags = get_float_exception_flags(status);
7171
7172    v = float32_to_int64_round_to_zero(a, status);
7173    if (v < 0) {
7174        res = 0;
7175    } else if (v > 0xffffffff) {
7176        res = 0xffffffff;
7177    } else {
7178        return v;
7179    }
7180    set_float_exception_flags(old_exc_flags, status);
7181    float_raise(float_flag_invalid, status);
7182    return res;
7183}
7184
7185int16_t float32_to_int16(float32 a, float_status *status)
7186{
7187    int32_t v;
7188    int16_t res;
7189    int old_exc_flags = get_float_exception_flags(status);
7190
7191    v = float32_to_int32(a, status);
7192    if (v < -0x8000) {
7193        res = -0x8000;
7194    } else if (v > 0x7fff) {
7195        res = 0x7fff;
7196    } else {
7197        return v;
7198    }
7199
7200    set_float_exception_flags(old_exc_flags, status);
7201    float_raise(float_flag_invalid, status);
7202    return res;
7203}
7204
7205uint16_t float32_to_uint16(float32 a, float_status *status)
7206{
7207    int32_t v;
7208    uint16_t res;
7209    int old_exc_flags = get_float_exception_flags(status);
7210
7211    v = float32_to_int32(a, status);
7212    if (v < 0) {
7213        res = 0;
7214    } else if (v > 0xffff) {
7215        res = 0xffff;
7216    } else {
7217        return v;
7218    }
7219
7220    set_float_exception_flags(old_exc_flags, status);
7221    float_raise(float_flag_invalid, status);
7222    return res;
7223}
7224
7225uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7226{
7227    int64_t v;
7228    uint16_t res;
7229    int old_exc_flags = get_float_exception_flags(status);
7230
7231    v = float32_to_int64_round_to_zero(a, status);
7232    if (v < 0) {
7233        res = 0;
7234    } else if (v > 0xffff) {
7235        res = 0xffff;
7236    } else {
7237        return v;
7238    }
7239    set_float_exception_flags(old_exc_flags, status);
7240    float_raise(float_flag_invalid, status);
7241    return res;
7242}
7243
7244uint32_t float64_to_uint32(float64 a, float_status *status)
7245{
7246    uint64_t v;
7247    uint32_t res;
7248    int old_exc_flags = get_float_exception_flags(status);
7249
7250    v = float64_to_uint64(a, status);
7251    if (v > 0xffffffff) {
7252        res = 0xffffffff;
7253    } else {
7254        return v;
7255    }
7256    set_float_exception_flags(old_exc_flags, status);
7257    float_raise(float_flag_invalid, status);
7258    return res;
7259}
7260
7261uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7262{
7263    uint64_t v;
7264    uint32_t res;
7265    int old_exc_flags = get_float_exception_flags(status);
7266
7267    v = float64_to_uint64_round_to_zero(a, status);
7268    if (v > 0xffffffff) {
7269        res = 0xffffffff;
7270    } else {
7271        return v;
7272    }
7273    set_float_exception_flags(old_exc_flags, status);
7274    float_raise(float_flag_invalid, status);
7275    return res;
7276}
7277
7278int16_t float64_to_int16(float64 a, float_status *status)
7279{
7280    int64_t v;
7281    int16_t res;
7282    int old_exc_flags = get_float_exception_flags(status);
7283
7284    v = float64_to_int32(a, status);
7285    if (v < -0x8000) {
7286        res = -0x8000;
7287    } else if (v > 0x7fff) {
7288        res = 0x7fff;
7289    } else {
7290        return v;
7291    }
7292
7293    set_float_exception_flags(old_exc_flags, status);
7294    float_raise(float_flag_invalid, status);
7295    return res;
7296}
7297
7298uint16_t float64_to_uint16(float64 a, float_status *status)
7299{
7300    int64_t v;
7301    uint16_t res;
7302    int old_exc_flags = get_float_exception_flags(status);
7303
7304    v = float64_to_int32(a, status);
7305    if (v < 0) {
7306        res = 0;
7307    } else if (v > 0xffff) {
7308        res = 0xffff;
7309    } else {
7310        return v;
7311    }
7312
7313    set_float_exception_flags(old_exc_flags, status);
7314    float_raise(float_flag_invalid, status);
7315    return res;
7316}
7317
7318uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7319{
7320    int64_t v;
7321    uint16_t res;
7322    int old_exc_flags = get_float_exception_flags(status);
7323
7324    v = float64_to_int64_round_to_zero(a, status);
7325    if (v < 0) {
7326        res = 0;
7327    } else if (v > 0xffff) {
7328        res = 0xffff;
7329    } else {
7330        return v;
7331    }
7332    set_float_exception_flags(old_exc_flags, status);
7333    float_raise(float_flag_invalid, status);
7334    return res;
7335}
7336
7337/*----------------------------------------------------------------------------
7338| Returns the result of converting the double-precision floating-point value
7339| `a' to the 64-bit unsigned integer format.  The conversion is
7340| performed according to the IEC/IEEE Standard for Binary Floating-Point
7341| Arithmetic---which means in particular that the conversion is rounded
7342| according to the current rounding mode.  If `a' is a NaN, the largest
7343| positive integer is returned.  If the conversion overflows, the
7344| largest unsigned integer is returned.  If 'a' is negative, the value is
7345| rounded and zero is returned; negative values that do not round to zero
7346| will raise the inexact exception.
7347*----------------------------------------------------------------------------*/
7348
7349uint64_t float64_to_uint64(float64 a, float_status *status)
7350{
7351    flag aSign;
7352    int aExp;
7353    int shiftCount;
7354    uint64_t aSig, aSigExtra;
7355    a = float64_squash_input_denormal(a, status);
7356
7357    aSig = extractFloat64Frac(a);
7358    aExp = extractFloat64Exp(a);
7359    aSign = extractFloat64Sign(a);
7360    if (aSign && (aExp > 1022)) {
7361        float_raise(float_flag_invalid, status);
7362        if (float64_is_any_nan(a)) {
7363            return LIT64(0xFFFFFFFFFFFFFFFF);
7364        } else {
7365            return 0;
7366        }
7367    }
7368    if (aExp) {
7369        aSig |= LIT64(0x0010000000000000);
7370    }
7371    shiftCount = 0x433 - aExp;
7372    if (shiftCount <= 0) {
7373        if (0x43E < aExp) {
7374            float_raise(float_flag_invalid, status);
7375            return LIT64(0xFFFFFFFFFFFFFFFF);
7376        }
7377        aSigExtra = 0;
7378        aSig <<= -shiftCount;
7379    } else {
7380        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7381    }
7382    return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7383}
7384
7385uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7386{
7387    signed char current_rounding_mode = status->float_rounding_mode;
7388    set_float_rounding_mode(float_round_to_zero, status);
7389    int64_t v = float64_to_uint64(a, status);
7390    set_float_rounding_mode(current_rounding_mode, status);
7391    return v;
7392}
7393
7394#define COMPARE(s, nan_exp)                                                  \
7395static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7396                                      int is_quiet, float_status *status)    \
7397{                                                                            \
7398    flag aSign, bSign;                                                       \
7399    uint ## s ## _t av, bv;                                                  \
7400    a = float ## s ## _squash_input_denormal(a, status);                     \
7401    b = float ## s ## _squash_input_denormal(b, status);                     \
7402                                                                             \
7403    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7404         extractFloat ## s ## Frac( a ) ) ||                                 \
7405        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7406          extractFloat ## s ## Frac( b ) )) {                                \
7407        if (!is_quiet ||                                                     \
7408            float ## s ## _is_signaling_nan(a, status) ||                  \
7409            float ## s ## _is_signaling_nan(b, status)) {                 \
7410            float_raise(float_flag_invalid, status);                         \
7411        }                                                                    \
7412        return float_relation_unordered;                                     \
7413    }                                                                        \
7414    aSign = extractFloat ## s ## Sign( a );                                  \
7415    bSign = extractFloat ## s ## Sign( b );                                  \
7416    av = float ## s ## _val(a);                                              \
7417    bv = float ## s ## _val(b);                                              \
7418    if ( aSign != bSign ) {                                                  \
7419        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7420            /* zero case */                                                  \
7421            return float_relation_equal;                                     \
7422        } else {                                                             \
7423            return 1 - (2 * aSign);                                          \
7424        }                                                                    \
7425    } else {                                                                 \
7426        if (av == bv) {                                                      \
7427            return float_relation_equal;                                     \
7428        } else {                                                             \
7429            return 1 - 2 * (aSign ^ ( av < bv ));                            \
7430        }                                                                    \
7431    }                                                                        \
7432}                                                                            \
7433                                                                             \
7434int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7435{                                                                            \
7436    return float ## s ## _compare_internal(a, b, 0, status);                 \
7437}                                                                            \
7438                                                                             \
7439int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7440                                 float_status *status)                       \
7441{                                                                            \
7442    return float ## s ## _compare_internal(a, b, 1, status);                 \
7443}
7444
7445COMPARE(32, 0xff)
7446COMPARE(64, 0x7ff)
7447
7448static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7449                                            int is_quiet, float_status *status)
7450{
7451    flag aSign, bSign;
7452
7453    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7454        float_raise(float_flag_invalid, status);
7455        return float_relation_unordered;
7456    }
7457    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7458          ( extractFloatx80Frac( a )<<1 ) ) ||
7459        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7460          ( extractFloatx80Frac( b )<<1 ) )) {
7461        if (!is_quiet ||
7462            floatx80_is_signaling_nan(a, status) ||
7463            floatx80_is_signaling_nan(b, status)) {
7464            float_raise(float_flag_invalid, status);
7465        }
7466        return float_relation_unordered;
7467    }
7468    aSign = extractFloatx80Sign( a );
7469    bSign = extractFloatx80Sign( b );
7470    if ( aSign != bSign ) {
7471
7472        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7473             ( ( a.low | b.low ) == 0 ) ) {
7474            /* zero case */
7475            return float_relation_equal;
7476        } else {
7477            return 1 - (2 * aSign);
7478        }
7479    } else {
7480        if (a.low == b.low && a.high == b.high) {
7481            return float_relation_equal;
7482        } else {
7483            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7484        }
7485    }
7486}
7487
7488int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7489{
7490    return floatx80_compare_internal(a, b, 0, status);
7491}
7492
7493int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7494{
7495    return floatx80_compare_internal(a, b, 1, status);
7496}
7497
7498static inline int float128_compare_internal(float128 a, float128 b,
7499                                            int is_quiet, float_status *status)
7500{
7501    flag aSign, bSign;
7502
7503    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7504          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7505        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7506          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7507        if (!is_quiet ||
7508            float128_is_signaling_nan(a, status) ||
7509            float128_is_signaling_nan(b, status)) {
7510            float_raise(float_flag_invalid, status);
7511        }
7512        return float_relation_unordered;
7513    }
7514    aSign = extractFloat128Sign( a );
7515    bSign = extractFloat128Sign( b );
7516    if ( aSign != bSign ) {
7517        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7518            /* zero case */
7519            return float_relation_equal;
7520        } else {
7521            return 1 - (2 * aSign);
7522        }
7523    } else {
7524        if (a.low == b.low && a.high == b.high) {
7525            return float_relation_equal;
7526        } else {
7527            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7528        }
7529    }
7530}
7531
7532int float128_compare(float128 a, float128 b, float_status *status)
7533{
7534    return float128_compare_internal(a, b, 0, status);
7535}
7536
7537int float128_compare_quiet(float128 a, float128 b, float_status *status)
7538{
7539    return float128_compare_internal(a, b, 1, status);
7540}
7541
7542/* min() and max() functions. These can't be implemented as
7543 * 'compare and pick one input' because that would mishandle
7544 * NaNs and +0 vs -0.
7545 *
7546 * minnum() and maxnum() functions. These are similar to the min()
7547 * and max() functions but if one of the arguments is a QNaN and
7548 * the other is numerical then the numerical argument is returned.
7549 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7550 * and maxNum() operations. min() and max() are the typical min/max
7551 * semantics provided by many CPUs which predate that specification.
7552 *
7553 * minnummag() and maxnummag() functions correspond to minNumMag()
7554 * and minNumMag() from the IEEE-754 2008.
7555 */
7556#define MINMAX(s)                                                       \
7557static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7558                                               int ismin, int isieee,   \
7559                                               int ismag,               \
7560                                               float_status *status)    \
7561{                                                                       \
7562    flag aSign, bSign;                                                  \
7563    uint ## s ## _t av, bv, aav, abv;                                   \
7564    a = float ## s ## _squash_input_denormal(a, status);                \
7565    b = float ## s ## _squash_input_denormal(b, status);                \
7566    if (float ## s ## _is_any_nan(a) ||                                 \
7567        float ## s ## _is_any_nan(b)) {                                 \
7568        if (isieee) {                                                   \
7569            if (float ## s ## _is_quiet_nan(a, status) &&               \
7570                !float ## s ##_is_any_nan(b)) {                         \
7571                return b;                                               \
7572            } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7573                       !float ## s ## _is_any_nan(a)) {                \
7574                return a;                                               \
7575            }                                                           \
7576        }                                                               \
7577        return propagateFloat ## s ## NaN(a, b, status);                \
7578    }                                                                   \
7579    aSign = extractFloat ## s ## Sign(a);                               \
7580    bSign = extractFloat ## s ## Sign(b);                               \
7581    av = float ## s ## _val(a);                                         \
7582    bv = float ## s ## _val(b);                                         \
7583    if (ismag) {                                                        \
7584        aav = float ## s ## _abs(av);                                   \
7585        abv = float ## s ## _abs(bv);                                   \
7586        if (aav != abv) {                                               \
7587            if (ismin) {                                                \
7588                return (aav < abv) ? a : b;                             \
7589            } else {                                                    \
7590                return (aav < abv) ? b : a;                             \
7591            }                                                           \
7592        }                                                               \
7593    }                                                                   \
7594    if (aSign != bSign) {                                               \
7595        if (ismin) {                                                    \
7596            return aSign ? a : b;                                       \
7597        } else {                                                        \
7598            return aSign ? b : a;                                       \
7599        }                                                               \
7600    } else {                                                            \
7601        if (ismin) {                                                    \
7602            return (aSign ^ (av < bv)) ? a : b;                         \
7603        } else {                                                        \
7604            return (aSign ^ (av < bv)) ? b : a;                         \
7605        }                                                               \
7606    }                                                                   \
7607}                                                                       \
7608                                                                        \
7609float ## s float ## s ## _min(float ## s a, float ## s b,               \
7610                              float_status *status)                     \
7611{                                                                       \
7612    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7613}                                                                       \
7614                                                                        \
7615float ## s float ## s ## _max(float ## s a, float ## s b,               \
7616                              float_status *status)                     \
7617{                                                                       \
7618    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7619}                                                                       \
7620                                                                        \
7621float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7622                                 float_status *status)                  \
7623{                                                                       \
7624    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7625}                                                                       \
7626                                                                        \
7627float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7628                                 float_status *status)                  \
7629{                                                                       \
7630    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7631}                                                                       \
7632                                                                        \
7633float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7634                                    float_status *status)               \
7635{                                                                       \
7636    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7637}                                                                       \
7638                                                                        \
7639float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7640                                    float_status *status)               \
7641{                                                                       \
7642    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7643}
7644
7645MINMAX(32)
7646MINMAX(64)
7647
7648
7649/* Multiply A by 2 raised to the power N.  */
7650float32 float32_scalbn(float32 a, int n, float_status *status)
7651{
7652    flag aSign;
7653    int16_t aExp;
7654    uint32_t aSig;
7655
7656    a = float32_squash_input_denormal(a, status);
7657    aSig = extractFloat32Frac( a );
7658    aExp = extractFloat32Exp( a );
7659    aSign = extractFloat32Sign( a );
7660
7661    if ( aExp == 0xFF ) {
7662        if ( aSig ) {
7663            return propagateFloat32NaN(a, a, status);
7664        }
7665        return a;
7666    }
7667    if (aExp != 0) {
7668        aSig |= 0x00800000;
7669    } else if (aSig == 0) {
7670        return a;
7671    } else {
7672        aExp++;
7673    }
7674
7675    if (n > 0x200) {
7676        n = 0x200;
7677    } else if (n < -0x200) {
7678        n = -0x200;
7679    }
7680
7681    aExp += n - 1;
7682    aSig <<= 7;
7683    return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7684}
7685
7686float64 float64_scalbn(float64 a, int n, float_status *status)
7687{
7688    flag aSign;
7689    int16_t aExp;
7690    uint64_t aSig;
7691
7692    a = float64_squash_input_denormal(a, status);
7693    aSig = extractFloat64Frac( a );
7694    aExp = extractFloat64Exp( a );
7695    aSign = extractFloat64Sign( a );
7696
7697    if ( aExp == 0x7FF ) {
7698        if ( aSig ) {
7699            return propagateFloat64NaN(a, a, status);
7700        }
7701        return a;
7702    }
7703    if (aExp != 0) {
7704        aSig |= LIT64( 0x0010000000000000 );
7705    } else if (aSig == 0) {
7706        return a;
7707    } else {
7708        aExp++;
7709    }
7710
7711    if (n > 0x1000) {
7712        n = 0x1000;
7713    } else if (n < -0x1000) {
7714        n = -0x1000;
7715    }
7716
7717    aExp += n - 1;
7718    aSig <<= 10;
7719    return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7720}
7721
7722floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7723{
7724    flag aSign;
7725    int32_t aExp;
7726    uint64_t aSig;
7727
7728    if (floatx80_invalid_encoding(a)) {
7729        float_raise(float_flag_invalid, status);
7730        return floatx80_default_nan(status);
7731    }
7732    aSig = extractFloatx80Frac( a );
7733    aExp = extractFloatx80Exp( a );
7734    aSign = extractFloatx80Sign( a );
7735
7736    if ( aExp == 0x7FFF ) {
7737        if ( aSig<<1 ) {
7738            return propagateFloatx80NaN(a, a, status);
7739        }
7740        return a;
7741    }
7742
7743    if (aExp == 0) {
7744        if (aSig == 0) {
7745            return a;
7746        }
7747        aExp++;
7748    }
7749
7750    if (n > 0x10000) {
7751        n = 0x10000;
7752    } else if (n < -0x10000) {
7753        n = -0x10000;
7754    }
7755
7756    aExp += n;
7757    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7758                                         aSign, aExp, aSig, 0, status);
7759}
7760
7761float128 float128_scalbn(float128 a, int n, float_status *status)
7762{
7763    flag aSign;
7764    int32_t aExp;
7765    uint64_t aSig0, aSig1;
7766
7767    aSig1 = extractFloat128Frac1( a );
7768    aSig0 = extractFloat128Frac0( a );
7769    aExp = extractFloat128Exp( a );
7770    aSign = extractFloat128Sign( a );
7771    if ( aExp == 0x7FFF ) {
7772        if ( aSig0 | aSig1 ) {
7773            return propagateFloat128NaN(a, a, status);
7774        }
7775        return a;
7776    }
7777    if (aExp != 0) {
7778        aSig0 |= LIT64( 0x0001000000000000 );
7779    } else if (aSig0 == 0 && aSig1 == 0) {
7780        return a;
7781    } else {
7782        aExp++;
7783    }
7784
7785    if (n > 0x10000) {
7786        n = 0x10000;
7787    } else if (n < -0x10000) {
7788        n = -0x10000;
7789    }
7790
7791    aExp += n - 1;
7792    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7793                                         , status);
7794
7795}
7796