qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86#include <math.h>
  87#include "qemu/bitops.h"
  88#include "fpu/softfloat.h"
  89
  90/* We only need stdlib for abort() */
  91
  92/*----------------------------------------------------------------------------
  93| Primitive arithmetic functions, including multi-word arithmetic, and
  94| division and square root approximations.  (Can be specialized to target if
  95| desired.)
  96*----------------------------------------------------------------------------*/
  97#include "fpu/softfloat-macros.h"
  98
  99/*
 100 * Hardfloat
 101 *
 102 * Fast emulation of guest FP instructions is challenging for two reasons.
 103 * First, FP instruction semantics are similar but not identical, particularly
 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105 * exception flags is not trivial: reading the host's flags register with a
 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107 * and trapping on every FP exception is not fast nor pleasant to work with.
 108 *
 109 * We address these challenges by leveraging the host FPU for a subset of the
 110 * operations. To do this we expand on the idea presented in this paper:
 111 *
 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114 *
 115 * The idea is thus to leverage the host FPU to (1) compute FP operations
 116 * and (2) identify whether FP exceptions occurred while avoiding
 117 * expensive exception flag register accesses.
 118 *
 119 * An important optimization shown in the paper is that given that exception
 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121 * This is particularly useful for the inexact flag, which is very frequently
 122 * raised in floating-point workloads.
 123 *
 124 * We optimize the code further by deferring to soft-fp whenever FP exception
 125 * detection might get hairy. Two examples: (1) when at least one operand is
 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127 * and the result is < the minimum normal.
 128 */
 129#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130    static inline void name(soft_t *a, float_status *s)                 \
 131    {                                                                   \
 132        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                     soft_t ## _is_neg(*a));            \
 135            s->float_exception_flags |= float_flag_input_denormal;      \
 136        }                                                               \
 137    }
 138
 139GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141#undef GEN_INPUT_FLUSH__NOCHECK
 142
 143#define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144    static inline void name(soft_t *a, float_status *s) \
 145    {                                                   \
 146        if (likely(!s->flush_inputs_to_zero)) {         \
 147            return;                                     \
 148        }                                               \
 149        soft_t ## _input_flush__nocheck(a, s);          \
 150    }
 151
 152GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154#undef GEN_INPUT_FLUSH1
 155
 156#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158    {                                                                   \
 159        if (likely(!s->flush_inputs_to_zero)) {                         \
 160            return;                                                     \
 161        }                                                               \
 162        soft_t ## _input_flush__nocheck(a, s);                          \
 163        soft_t ## _input_flush__nocheck(b, s);                          \
 164    }
 165
 166GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168#undef GEN_INPUT_FLUSH2
 169
 170#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172    {                                                                   \
 173        if (likely(!s->flush_inputs_to_zero)) {                         \
 174            return;                                                     \
 175        }                                                               \
 176        soft_t ## _input_flush__nocheck(a, s);                          \
 177        soft_t ## _input_flush__nocheck(b, s);                          \
 178        soft_t ## _input_flush__nocheck(c, s);                          \
 179    }
 180
 181GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183#undef GEN_INPUT_FLUSH3
 184
 185/*
 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187 * hardfloat functions. Each combination of number of inputs and float size
 188 * gets its own value.
 189 */
 190#if defined(__x86_64__)
 191# define QEMU_HARDFLOAT_1F32_USE_FP 0
 192# define QEMU_HARDFLOAT_1F64_USE_FP 1
 193# define QEMU_HARDFLOAT_2F32_USE_FP 0
 194# define QEMU_HARDFLOAT_2F64_USE_FP 1
 195# define QEMU_HARDFLOAT_3F32_USE_FP 0
 196# define QEMU_HARDFLOAT_3F64_USE_FP 1
 197#else
 198# define QEMU_HARDFLOAT_1F32_USE_FP 0
 199# define QEMU_HARDFLOAT_1F64_USE_FP 0
 200# define QEMU_HARDFLOAT_2F32_USE_FP 0
 201# define QEMU_HARDFLOAT_2F64_USE_FP 0
 202# define QEMU_HARDFLOAT_3F32_USE_FP 0
 203# define QEMU_HARDFLOAT_3F64_USE_FP 0
 204#endif
 205
 206/*
 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208 * float{32,64}_is_infinity when !USE_FP.
 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211 */
 212#if defined(__x86_64__) || defined(__aarch64__)
 213# define QEMU_HARDFLOAT_USE_ISINF   1
 214#else
 215# define QEMU_HARDFLOAT_USE_ISINF   0
 216#endif
 217
 218/*
 219 * Some targets clear the FP flags before most FP operations. This prevents
 220 * the use of hardfloat, since hardfloat relies on the inexact flag being
 221 * already set.
 222 */
 223#if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224# if defined(__FAST_MATH__)
 225#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226    IEEE implementation
 227# endif
 228# define QEMU_NO_HARDFLOAT 1
 229# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230#else
 231# define QEMU_NO_HARDFLOAT 0
 232# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233#endif
 234
 235static inline bool can_use_fpu(const float_status *s)
 236{
 237    if (QEMU_NO_HARDFLOAT) {
 238        return false;
 239    }
 240    return likely(s->float_exception_flags & float_flag_inexact &&
 241                  s->float_rounding_mode == float_round_nearest_even);
 242}
 243
 244/*
 245 * Hardfloat generation functions. Each operation can have two flavors:
 246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247 * most condition checks, or native ones (e.g. fpclassify).
 248 *
 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250 * compiler to propagate constants and inline everything into the callers.
 251 *
 252 * We only generate functions for operations with two inputs, since only
 253 * these are common enough to justify consolidating them into common code.
 254 */
 255
 256typedef union {
 257    float32 s;
 258    float h;
 259} union_float32;
 260
 261typedef union {
 262    float64 s;
 263    double h;
 264} union_float64;
 265
 266typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271typedef float   (*hard_f32_op2_fn)(float a, float b);
 272typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274/* 2-input is-zero-or-normal */
 275static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276{
 277    if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278        /*
 279         * Not using a temp variable for consecutive fpclassify calls ends up
 280         * generating faster code.
 281         */
 282        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284    }
 285    return float32_is_zero_or_normal(a.s) &&
 286           float32_is_zero_or_normal(b.s);
 287}
 288
 289static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290{
 291    if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294    }
 295    return float64_is_zero_or_normal(a.s) &&
 296           float64_is_zero_or_normal(b.s);
 297}
 298
 299/* 3-input is-zero-or-normal */
 300static inline
 301bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302{
 303    if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307    }
 308    return float32_is_zero_or_normal(a.s) &&
 309           float32_is_zero_or_normal(b.s) &&
 310           float32_is_zero_or_normal(c.s);
 311}
 312
 313static inline
 314bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315{
 316    if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320    }
 321    return float64_is_zero_or_normal(a.s) &&
 322           float64_is_zero_or_normal(b.s) &&
 323           float64_is_zero_or_normal(c.s);
 324}
 325
 326static inline bool f32_is_inf(union_float32 a)
 327{
 328    if (QEMU_HARDFLOAT_USE_ISINF) {
 329        return isinf(a.h);
 330    }
 331    return float32_is_infinity(a.s);
 332}
 333
 334static inline bool f64_is_inf(union_float64 a)
 335{
 336    if (QEMU_HARDFLOAT_USE_ISINF) {
 337        return isinf(a.h);
 338    }
 339    return float64_is_infinity(a.s);
 340}
 341
 342/* Note: @fast_test and @post can be NULL */
 343static inline float32
 344float32_gen2(float32 xa, float32 xb, float_status *s,
 345             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 346             f32_check_fn pre, f32_check_fn post,
 347             f32_check_fn fast_test, soft_f32_op2_fn fast_op)
 348{
 349    union_float32 ua, ub, ur;
 350
 351    ua.s = xa;
 352    ub.s = xb;
 353
 354    if (unlikely(!can_use_fpu(s))) {
 355        goto soft;
 356    }
 357
 358    float32_input_flush2(&ua.s, &ub.s, s);
 359    if (unlikely(!pre(ua, ub))) {
 360        goto soft;
 361    }
 362    if (fast_test && fast_test(ua, ub)) {
 363        return fast_op(ua.s, ub.s, s);
 364    }
 365
 366    ur.h = hard(ua.h, ub.h);
 367    if (unlikely(f32_is_inf(ur))) {
 368        s->float_exception_flags |= float_flag_overflow;
 369    } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
 370        if (post == NULL || post(ua, ub)) {
 371            goto soft;
 372        }
 373    }
 374    return ur.s;
 375
 376 soft:
 377    return soft(ua.s, ub.s, s);
 378}
 379
 380static inline float64
 381float64_gen2(float64 xa, float64 xb, float_status *s,
 382             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 383             f64_check_fn pre, f64_check_fn post,
 384             f64_check_fn fast_test, soft_f64_op2_fn fast_op)
 385{
 386    union_float64 ua, ub, ur;
 387
 388    ua.s = xa;
 389    ub.s = xb;
 390
 391    if (unlikely(!can_use_fpu(s))) {
 392        goto soft;
 393    }
 394
 395    float64_input_flush2(&ua.s, &ub.s, s);
 396    if (unlikely(!pre(ua, ub))) {
 397        goto soft;
 398    }
 399    if (fast_test && fast_test(ua, ub)) {
 400        return fast_op(ua.s, ub.s, s);
 401    }
 402
 403    ur.h = hard(ua.h, ub.h);
 404    if (unlikely(f64_is_inf(ur))) {
 405        s->float_exception_flags |= float_flag_overflow;
 406    } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
 407        if (post == NULL || post(ua, ub)) {
 408            goto soft;
 409        }
 410    }
 411    return ur.s;
 412
 413 soft:
 414    return soft(ua.s, ub.s, s);
 415}
 416
 417/*----------------------------------------------------------------------------
 418| Returns the fraction bits of the single-precision floating-point value `a'.
 419*----------------------------------------------------------------------------*/
 420
 421static inline uint32_t extractFloat32Frac(float32 a)
 422{
 423    return float32_val(a) & 0x007FFFFF;
 424}
 425
 426/*----------------------------------------------------------------------------
 427| Returns the exponent bits of the single-precision floating-point value `a'.
 428*----------------------------------------------------------------------------*/
 429
 430static inline int extractFloat32Exp(float32 a)
 431{
 432    return (float32_val(a) >> 23) & 0xFF;
 433}
 434
 435/*----------------------------------------------------------------------------
 436| Returns the sign bit of the single-precision floating-point value `a'.
 437*----------------------------------------------------------------------------*/
 438
 439static inline flag extractFloat32Sign(float32 a)
 440{
 441    return float32_val(a) >> 31;
 442}
 443
 444/*----------------------------------------------------------------------------
 445| Returns the fraction bits of the double-precision floating-point value `a'.
 446*----------------------------------------------------------------------------*/
 447
 448static inline uint64_t extractFloat64Frac(float64 a)
 449{
 450    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 451}
 452
 453/*----------------------------------------------------------------------------
 454| Returns the exponent bits of the double-precision floating-point value `a'.
 455*----------------------------------------------------------------------------*/
 456
 457static inline int extractFloat64Exp(float64 a)
 458{
 459    return (float64_val(a) >> 52) & 0x7FF;
 460}
 461
 462/*----------------------------------------------------------------------------
 463| Returns the sign bit of the double-precision floating-point value `a'.
 464*----------------------------------------------------------------------------*/
 465
 466static inline flag extractFloat64Sign(float64 a)
 467{
 468    return float64_val(a) >> 63;
 469}
 470
 471/*
 472 * Classify a floating point number. Everything above float_class_qnan
 473 * is a NaN so cls >= float_class_qnan is any NaN.
 474 */
 475
 476typedef enum __attribute__ ((__packed__)) {
 477    float_class_unclassified,
 478    float_class_zero,
 479    float_class_normal,
 480    float_class_inf,
 481    float_class_qnan,  /* all NaNs from here */
 482    float_class_snan,
 483} FloatClass;
 484
 485/* Simple helpers for checking if, or what kind of, NaN we have */
 486static inline __attribute__((unused)) bool is_nan(FloatClass c)
 487{
 488    return unlikely(c >= float_class_qnan);
 489}
 490
 491static inline __attribute__((unused)) bool is_snan(FloatClass c)
 492{
 493    return c == float_class_snan;
 494}
 495
 496static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 497{
 498    return c == float_class_qnan;
 499}
 500
 501/*
 502 * Structure holding all of the decomposed parts of a float. The
 503 * exponent is unbiased and the fraction is normalized. All
 504 * calculations are done with a 64 bit fraction and then rounded as
 505 * appropriate for the final format.
 506 *
 507 * Thanks to the packed FloatClass a decent compiler should be able to
 508 * fit the whole structure into registers and avoid using the stack
 509 * for parameter passing.
 510 */
 511
 512typedef struct {
 513    uint64_t frac;
 514    int32_t  exp;
 515    FloatClass cls;
 516    bool sign;
 517} FloatParts;
 518
 519#define DECOMPOSED_BINARY_POINT    (64 - 2)
 520#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 521#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 522
 523/* Structure holding all of the relevant parameters for a format.
 524 *   exp_size: the size of the exponent field
 525 *   exp_bias: the offset applied to the exponent field
 526 *   exp_max: the maximum normalised exponent
 527 *   frac_size: the size of the fraction field
 528 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 529 * The following are computed based the size of fraction
 530 *   frac_lsb: least significant bit of fraction
 531 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 532 *   round_mask/roundeven_mask: masks used for rounding
 533 * The following optional modifiers are available:
 534 *   arm_althp: handle ARM Alternative Half Precision
 535 */
 536typedef struct {
 537    int exp_size;
 538    int exp_bias;
 539    int exp_max;
 540    int frac_size;
 541    int frac_shift;
 542    uint64_t frac_lsb;
 543    uint64_t frac_lsbm1;
 544    uint64_t round_mask;
 545    uint64_t roundeven_mask;
 546    bool arm_althp;
 547} FloatFmt;
 548
 549/* Expand fields based on the size of exponent and fraction */
 550#define FLOAT_PARAMS(E, F)                                           \
 551    .exp_size       = E,                                             \
 552    .exp_bias       = ((1 << E) - 1) >> 1,                           \
 553    .exp_max        = (1 << E) - 1,                                  \
 554    .frac_size      = F,                                             \
 555    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 556    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 557    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 558    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 559    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 560
 561static const FloatFmt float16_params = {
 562    FLOAT_PARAMS(5, 10)
 563};
 564
 565static const FloatFmt float16_params_ahp = {
 566    FLOAT_PARAMS(5, 10),
 567    .arm_althp = true
 568};
 569
 570static const FloatFmt float32_params = {
 571    FLOAT_PARAMS(8, 23)
 572};
 573
 574static const FloatFmt float64_params = {
 575    FLOAT_PARAMS(11, 52)
 576};
 577
 578/* Unpack a float to parts, but do not canonicalize.  */
 579static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 580{
 581    const int sign_pos = fmt.frac_size + fmt.exp_size;
 582
 583    return (FloatParts) {
 584        .cls = float_class_unclassified,
 585        .sign = extract64(raw, sign_pos, 1),
 586        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 587        .frac = extract64(raw, 0, fmt.frac_size),
 588    };
 589}
 590
 591static inline FloatParts float16_unpack_raw(float16 f)
 592{
 593    return unpack_raw(float16_params, f);
 594}
 595
 596static inline FloatParts float32_unpack_raw(float32 f)
 597{
 598    return unpack_raw(float32_params, f);
 599}
 600
 601static inline FloatParts float64_unpack_raw(float64 f)
 602{
 603    return unpack_raw(float64_params, f);
 604}
 605
 606/* Pack a float from parts, but do not canonicalize.  */
 607static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 608{
 609    const int sign_pos = fmt.frac_size + fmt.exp_size;
 610    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 611    return deposit64(ret, sign_pos, 1, p.sign);
 612}
 613
 614static inline float16 float16_pack_raw(FloatParts p)
 615{
 616    return make_float16(pack_raw(float16_params, p));
 617}
 618
 619static inline float32 float32_pack_raw(FloatParts p)
 620{
 621    return make_float32(pack_raw(float32_params, p));
 622}
 623
 624static inline float64 float64_pack_raw(FloatParts p)
 625{
 626    return make_float64(pack_raw(float64_params, p));
 627}
 628
 629/*----------------------------------------------------------------------------
 630| Functions and definitions to determine:  (1) whether tininess for underflow
 631| is detected before or after rounding by default, (2) what (if anything)
 632| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 633| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 634| are propagated from function inputs to output.  These details are target-
 635| specific.
 636*----------------------------------------------------------------------------*/
 637#include "softfloat-specialize.inc.c"
 638
 639/* Canonicalize EXP and FRAC, setting CLS.  */
 640static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
 641                                  float_status *status)
 642{
 643    if (part.exp == parm->exp_max && !parm->arm_althp) {
 644        if (part.frac == 0) {
 645            part.cls = float_class_inf;
 646        } else {
 647            part.frac <<= parm->frac_shift;
 648            part.cls = (parts_is_snan_frac(part.frac, status)
 649                        ? float_class_snan : float_class_qnan);
 650        }
 651    } else if (part.exp == 0) {
 652        if (likely(part.frac == 0)) {
 653            part.cls = float_class_zero;
 654        } else if (status->flush_inputs_to_zero) {
 655            float_raise(float_flag_input_denormal, status);
 656            part.cls = float_class_zero;
 657            part.frac = 0;
 658        } else {
 659            int shift = clz64(part.frac) - 1;
 660            part.cls = float_class_normal;
 661            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 662            part.frac <<= shift;
 663        }
 664    } else {
 665        part.cls = float_class_normal;
 666        part.exp -= parm->exp_bias;
 667        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 668    }
 669    return part;
 670}
 671
 672/* Round and uncanonicalize a floating-point number by parts. There
 673 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 674 * fraction; these bits will be removed. The exponent will be biased
 675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 676 */
 677
 678static FloatParts round_canonical(FloatParts p, float_status *s,
 679                                  const FloatFmt *parm)
 680{
 681    const uint64_t frac_lsb = parm->frac_lsb;
 682    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 683    const uint64_t round_mask = parm->round_mask;
 684    const uint64_t roundeven_mask = parm->roundeven_mask;
 685    const int exp_max = parm->exp_max;
 686    const int frac_shift = parm->frac_shift;
 687    uint64_t frac, inc;
 688    int exp, flags = 0;
 689    bool overflow_norm;
 690
 691    frac = p.frac;
 692    exp = p.exp;
 693
 694    switch (p.cls) {
 695    case float_class_normal:
 696        switch (s->float_rounding_mode) {
 697        case float_round_nearest_even:
 698            overflow_norm = false;
 699            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 700            break;
 701        case float_round_ties_away:
 702            overflow_norm = false;
 703            inc = frac_lsbm1;
 704            break;
 705        case float_round_to_zero:
 706            overflow_norm = true;
 707            inc = 0;
 708            break;
 709        case float_round_up:
 710            inc = p.sign ? 0 : round_mask;
 711            overflow_norm = p.sign;
 712            break;
 713        case float_round_down:
 714            inc = p.sign ? round_mask : 0;
 715            overflow_norm = !p.sign;
 716            break;
 717        case float_round_to_odd:
 718            overflow_norm = true;
 719            inc = frac & frac_lsb ? 0 : round_mask;
 720            break;
 721        default:
 722            g_assert_not_reached();
 723        }
 724
 725        exp += parm->exp_bias;
 726        if (likely(exp > 0)) {
 727            if (frac & round_mask) {
 728                flags |= float_flag_inexact;
 729                frac += inc;
 730                if (frac & DECOMPOSED_OVERFLOW_BIT) {
 731                    frac >>= 1;
 732                    exp++;
 733                }
 734            }
 735            frac >>= frac_shift;
 736
 737            if (parm->arm_althp) {
 738                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 739                if (unlikely(exp > exp_max)) {
 740                    /* Overflow.  Return the maximum normal.  */
 741                    flags = float_flag_invalid;
 742                    exp = exp_max;
 743                    frac = -1;
 744                }
 745            } else if (unlikely(exp >= exp_max)) {
 746                flags |= float_flag_overflow | float_flag_inexact;
 747                if (overflow_norm) {
 748                    exp = exp_max - 1;
 749                    frac = -1;
 750                } else {
 751                    p.cls = float_class_inf;
 752                    goto do_inf;
 753                }
 754            }
 755        } else if (s->flush_to_zero) {
 756            flags |= float_flag_output_denormal;
 757            p.cls = float_class_zero;
 758            goto do_zero;
 759        } else {
 760            bool is_tiny = (s->float_detect_tininess
 761                            == float_tininess_before_rounding)
 762                        || (exp < 0)
 763                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 764
 765            shift64RightJamming(frac, 1 - exp, &frac);
 766            if (frac & round_mask) {
 767                /* Need to recompute round-to-even.  */
 768                switch (s->float_rounding_mode) {
 769                case float_round_nearest_even:
 770                    inc = ((frac & roundeven_mask) != frac_lsbm1
 771                           ? frac_lsbm1 : 0);
 772                    break;
 773                case float_round_to_odd:
 774                    inc = frac & frac_lsb ? 0 : round_mask;
 775                    break;
 776                }
 777                flags |= float_flag_inexact;
 778                frac += inc;
 779            }
 780
 781            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 782            frac >>= frac_shift;
 783
 784            if (is_tiny && (flags & float_flag_inexact)) {
 785                flags |= float_flag_underflow;
 786            }
 787            if (exp == 0 && frac == 0) {
 788                p.cls = float_class_zero;
 789            }
 790        }
 791        break;
 792
 793    case float_class_zero:
 794    do_zero:
 795        exp = 0;
 796        frac = 0;
 797        break;
 798
 799    case float_class_inf:
 800    do_inf:
 801        assert(!parm->arm_althp);
 802        exp = exp_max;
 803        frac = 0;
 804        break;
 805
 806    case float_class_qnan:
 807    case float_class_snan:
 808        assert(!parm->arm_althp);
 809        exp = exp_max;
 810        frac >>= parm->frac_shift;
 811        break;
 812
 813    default:
 814        g_assert_not_reached();
 815    }
 816
 817    float_raise(flags, s);
 818    p.exp = exp;
 819    p.frac = frac;
 820    return p;
 821}
 822
 823/* Explicit FloatFmt version */
 824static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 825                                            const FloatFmt *params)
 826{
 827    return sf_canonicalize(float16_unpack_raw(f), params, s);
 828}
 829
 830static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 831{
 832    return float16a_unpack_canonical(f, s, &float16_params);
 833}
 834
 835static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 836                                             const FloatFmt *params)
 837{
 838    return float16_pack_raw(round_canonical(p, s, params));
 839}
 840
 841static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 842{
 843    return float16a_round_pack_canonical(p, s, &float16_params);
 844}
 845
 846static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 847{
 848    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 849}
 850
 851static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 852{
 853    return float32_pack_raw(round_canonical(p, s, &float32_params));
 854}
 855
 856static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 857{
 858    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 859}
 860
 861static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 862{
 863    return float64_pack_raw(round_canonical(p, s, &float64_params));
 864}
 865
 866static FloatParts return_nan(FloatParts a, float_status *s)
 867{
 868    switch (a.cls) {
 869    case float_class_snan:
 870        s->float_exception_flags |= float_flag_invalid;
 871        a = parts_silence_nan(a, s);
 872        /* fall through */
 873    case float_class_qnan:
 874        if (s->default_nan_mode) {
 875            return parts_default_nan(s);
 876        }
 877        break;
 878
 879    default:
 880        g_assert_not_reached();
 881    }
 882    return a;
 883}
 884
 885static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 886{
 887    if (is_snan(a.cls) || is_snan(b.cls)) {
 888        s->float_exception_flags |= float_flag_invalid;
 889    }
 890
 891    if (s->default_nan_mode) {
 892        return parts_default_nan(s);
 893    } else {
 894        if (pickNaN(a.cls, b.cls,
 895                    a.frac > b.frac ||
 896                    (a.frac == b.frac && a.sign < b.sign))) {
 897            a = b;
 898        }
 899        if (is_snan(a.cls)) {
 900            return parts_silence_nan(a, s);
 901        }
 902    }
 903    return a;
 904}
 905
 906static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 907                                  bool inf_zero, float_status *s)
 908{
 909    int which;
 910
 911    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 912        s->float_exception_flags |= float_flag_invalid;
 913    }
 914
 915    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 916
 917    if (s->default_nan_mode) {
 918        /* Note that this check is after pickNaNMulAdd so that function
 919         * has an opportunity to set the Invalid flag.
 920         */
 921        which = 3;
 922    }
 923
 924    switch (which) {
 925    case 0:
 926        break;
 927    case 1:
 928        a = b;
 929        break;
 930    case 2:
 931        a = c;
 932        break;
 933    case 3:
 934        return parts_default_nan(s);
 935    default:
 936        g_assert_not_reached();
 937    }
 938
 939    if (is_snan(a.cls)) {
 940        return parts_silence_nan(a, s);
 941    }
 942    return a;
 943}
 944
 945/*
 946 * Returns the result of adding or subtracting the values of the
 947 * floating-point values `a' and `b'. The operation is performed
 948 * according to the IEC/IEEE Standard for Binary Floating-Point
 949 * Arithmetic.
 950 */
 951
 952static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 953                                float_status *s)
 954{
 955    bool a_sign = a.sign;
 956    bool b_sign = b.sign ^ subtract;
 957
 958    if (a_sign != b_sign) {
 959        /* Subtraction */
 960
 961        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 962            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 963                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 964                a.frac = a.frac - b.frac;
 965            } else {
 966                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 967                a.frac = b.frac - a.frac;
 968                a.exp = b.exp;
 969                a_sign ^= 1;
 970            }
 971
 972            if (a.frac == 0) {
 973                a.cls = float_class_zero;
 974                a.sign = s->float_rounding_mode == float_round_down;
 975            } else {
 976                int shift = clz64(a.frac) - 1;
 977                a.frac = a.frac << shift;
 978                a.exp = a.exp - shift;
 979                a.sign = a_sign;
 980            }
 981            return a;
 982        }
 983        if (is_nan(a.cls) || is_nan(b.cls)) {
 984            return pick_nan(a, b, s);
 985        }
 986        if (a.cls == float_class_inf) {
 987            if (b.cls == float_class_inf) {
 988                float_raise(float_flag_invalid, s);
 989                return parts_default_nan(s);
 990            }
 991            return a;
 992        }
 993        if (a.cls == float_class_zero && b.cls == float_class_zero) {
 994            a.sign = s->float_rounding_mode == float_round_down;
 995            return a;
 996        }
 997        if (a.cls == float_class_zero || b.cls == float_class_inf) {
 998            b.sign = a_sign ^ 1;
 999            return b;
1000        }
1001        if (b.cls == float_class_zero) {
1002            return a;
1003        }
1004    } else {
1005        /* Addition */
1006        if (a.cls == float_class_normal && b.cls == float_class_normal) {
1007            if (a.exp > b.exp) {
1008                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1009            } else if (a.exp < b.exp) {
1010                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1011                a.exp = b.exp;
1012            }
1013            a.frac += b.frac;
1014            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1015                shift64RightJamming(a.frac, 1, &a.frac);
1016                a.exp += 1;
1017            }
1018            return a;
1019        }
1020        if (is_nan(a.cls) || is_nan(b.cls)) {
1021            return pick_nan(a, b, s);
1022        }
1023        if (a.cls == float_class_inf || b.cls == float_class_zero) {
1024            return a;
1025        }
1026        if (b.cls == float_class_inf || a.cls == float_class_zero) {
1027            b.sign = b_sign;
1028            return b;
1029        }
1030    }
1031    g_assert_not_reached();
1032}
1033
1034/*
1035 * Returns the result of adding or subtracting the floating-point
1036 * values `a' and `b'. The operation is performed according to the
1037 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1038 */
1039
1040float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1041{
1042    FloatParts pa = float16_unpack_canonical(a, status);
1043    FloatParts pb = float16_unpack_canonical(b, status);
1044    FloatParts pr = addsub_floats(pa, pb, false, status);
1045
1046    return float16_round_pack_canonical(pr, status);
1047}
1048
1049float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1050{
1051    FloatParts pa = float16_unpack_canonical(a, status);
1052    FloatParts pb = float16_unpack_canonical(b, status);
1053    FloatParts pr = addsub_floats(pa, pb, true, status);
1054
1055    return float16_round_pack_canonical(pr, status);
1056}
1057
1058static float32 QEMU_SOFTFLOAT_ATTR
1059soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1060{
1061    FloatParts pa = float32_unpack_canonical(a, status);
1062    FloatParts pb = float32_unpack_canonical(b, status);
1063    FloatParts pr = addsub_floats(pa, pb, subtract, status);
1064
1065    return float32_round_pack_canonical(pr, status);
1066}
1067
1068static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1069{
1070    return soft_f32_addsub(a, b, false, status);
1071}
1072
1073static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1074{
1075    return soft_f32_addsub(a, b, true, status);
1076}
1077
1078static float64 QEMU_SOFTFLOAT_ATTR
1079soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1080{
1081    FloatParts pa = float64_unpack_canonical(a, status);
1082    FloatParts pb = float64_unpack_canonical(b, status);
1083    FloatParts pr = addsub_floats(pa, pb, subtract, status);
1084
1085    return float64_round_pack_canonical(pr, status);
1086}
1087
1088static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1089{
1090    return soft_f64_addsub(a, b, false, status);
1091}
1092
1093static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1094{
1095    return soft_f64_addsub(a, b, true, status);
1096}
1097
/* Host-FPU single-precision addition, used as the `hard' add callback. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}
1102
/* Host-FPU single-precision subtraction, used as the `hard' sub callback. */
static float hard_f32_sub(float a, float b)
{
    return a - b;
}
1107
/* Host-FPU double-precision addition, used as the `hard' add callback. */
static double hard_f64_add(double a, double b)
{
    return a + b;
}
1112
/* Host-FPU double-precision subtraction, used as the `hard' sub callback. */
static double hard_f64_sub(double a, double b)
{
    return a - b;
}
1117
1118static bool f32_addsub_post(union_float32 a, union_float32 b)
1119{
1120    if (QEMU_HARDFLOAT_2F32_USE_FP) {
1121        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1122    }
1123    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1124}
1125
1126static bool f64_addsub_post(union_float64 a, union_float64 b)
1127{
1128    if (QEMU_HARDFLOAT_2F64_USE_FP) {
1129        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130    } else {
1131        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1132    }
1133}
1134
1135static float32 float32_addsub(float32 a, float32 b, float_status *s,
1136                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1137{
1138    return float32_gen2(a, b, s, hard, soft,
1139                        f32_is_zon2, f32_addsub_post, NULL, NULL);
1140}
1141
1142static float64 float64_addsub(float64 a, float64 b, float_status *s,
1143                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1144{
1145    return float64_gen2(a, b, s, hard, soft,
1146                        f64_is_zon2, f64_addsub_post, NULL, NULL);
1147}
1148
1149float32 QEMU_FLATTEN
1150float32_add(float32 a, float32 b, float_status *s)
1151{
1152    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1153}
1154
1155float32 QEMU_FLATTEN
1156float32_sub(float32 a, float32 b, float_status *s)
1157{
1158    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1159}
1160
1161float64 QEMU_FLATTEN
1162float64_add(float64 a, float64 b, float_status *s)
1163{
1164    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1165}
1166
1167float64 QEMU_FLATTEN
1168float64_sub(float64 a, float64 b, float_status *s)
1169{
1170    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1171}
1172
1173/*
1174 * Returns the result of multiplying the floating-point values `a' and
1175 * `b'. The operation is performed according to the IEC/IEEE Standard
1176 * for Binary Floating-Point Arithmetic.
1177 */
1178
1179static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1180{
1181    bool sign = a.sign ^ b.sign;
1182
1183    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1184        uint64_t hi, lo;
1185        int exp = a.exp + b.exp;
1186
1187        mul64To128(a.frac, b.frac, &hi, &lo);
1188        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1189        if (lo & DECOMPOSED_OVERFLOW_BIT) {
1190            shift64RightJamming(lo, 1, &lo);
1191            exp += 1;
1192        }
1193
1194        /* Re-use a */
1195        a.exp = exp;
1196        a.sign = sign;
1197        a.frac = lo;
1198        return a;
1199    }
1200    /* handle all the NaN cases */
1201    if (is_nan(a.cls) || is_nan(b.cls)) {
1202        return pick_nan(a, b, s);
1203    }
1204    /* Inf * Zero == NaN */
1205    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1206        (a.cls == float_class_zero && b.cls == float_class_inf)) {
1207        s->float_exception_flags |= float_flag_invalid;
1208        return parts_default_nan(s);
1209    }
1210    /* Multiply by 0 or Inf */
1211    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1212        a.sign = sign;
1213        return a;
1214    }
1215    if (b.cls == float_class_inf || b.cls == float_class_zero) {
1216        b.sign = sign;
1217        return b;
1218    }
1219    g_assert_not_reached();
1220}
1221
1222float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1223{
1224    FloatParts pa = float16_unpack_canonical(a, status);
1225    FloatParts pb = float16_unpack_canonical(b, status);
1226    FloatParts pr = mul_floats(pa, pb, status);
1227
1228    return float16_round_pack_canonical(pr, status);
1229}
1230
1231static float32 QEMU_SOFTFLOAT_ATTR
1232soft_f32_mul(float32 a, float32 b, float_status *status)
1233{
1234    FloatParts pa = float32_unpack_canonical(a, status);
1235    FloatParts pb = float32_unpack_canonical(b, status);
1236    FloatParts pr = mul_floats(pa, pb, status);
1237
1238    return float32_round_pack_canonical(pr, status);
1239}
1240
1241static float64 QEMU_SOFTFLOAT_ATTR
1242soft_f64_mul(float64 a, float64 b, float_status *status)
1243{
1244    FloatParts pa = float64_unpack_canonical(a, status);
1245    FloatParts pb = float64_unpack_canonical(b, status);
1246    FloatParts pr = mul_floats(pa, pb, status);
1247
1248    return float64_round_pack_canonical(pr, status);
1249}
1250
/* Host-FPU single-precision multiplication, the `hard' mul callback. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}
1255
/* Host-FPU double-precision multiplication, the `hard' mul callback. */
static double hard_f64_mul(double a, double b)
{
    return a * b;
}
1260
1261static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1262{
1263    return float32_is_zero(a.s) || float32_is_zero(b.s);
1264}
1265
1266static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1267{
1268    return float64_is_zero(a.s) || float64_is_zero(b.s);
1269}
1270
1271static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1272{
1273    bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1274
1275    return float32_set_sign(float32_zero, signbit);
1276}
1277
1278static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1279{
1280    bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1281
1282    return float64_set_sign(float64_zero, signbit);
1283}
1284
1285float32 QEMU_FLATTEN
1286float32_mul(float32 a, float32 b, float_status *s)
1287{
1288    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1289                        f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1290}
1291
1292float64 QEMU_FLATTEN
1293float64_mul(float64 a, float64 b, float_status *s)
1294{
1295    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1296                        f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1297}
1298
1299/*
1300 * Returns the result of multiplying the floating-point values `a' and
1301 * `b' then adding 'c', with no intermediate rounding step after the
1302 * multiplication. The operation is performed according to the
1303 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304 * The flags argument allows the caller to select negation of the
1305 * addend, the intermediate product, or the final result. (The
1306 * difference between this and having the caller do a separate
1307 * negation is that negating externally will flip the sign bit on
1308 * NaNs.)
1309 */
1310
1311static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1312                                int flags, float_status *s)
1313{
1314    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1315                    ((1 << float_class_inf) | (1 << float_class_zero));
1316    bool p_sign;
1317    bool sign_flip = flags & float_muladd_negate_result;
1318    FloatClass p_class;
1319    uint64_t hi, lo;
1320    int p_exp;
1321
1322    /* It is implementation-defined whether the cases of (0,inf,qnan)
1323     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324     * they return if they do), so we have to hand this information
1325     * off to the target-specific pick-a-NaN routine.
1326     */
1327    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1328        return pick_nan_muladd(a, b, c, inf_zero, s);
1329    }
1330
1331    if (inf_zero) {
1332        s->float_exception_flags |= float_flag_invalid;
1333        return parts_default_nan(s);
1334    }
1335
1336    if (flags & float_muladd_negate_c) {
1337        c.sign ^= 1;
1338    }
1339
1340    p_sign = a.sign ^ b.sign;
1341
1342    if (flags & float_muladd_negate_product) {
1343        p_sign ^= 1;
1344    }
1345
1346    if (a.cls == float_class_inf || b.cls == float_class_inf) {
1347        p_class = float_class_inf;
1348    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1349        p_class = float_class_zero;
1350    } else {
1351        p_class = float_class_normal;
1352    }
1353
1354    if (c.cls == float_class_inf) {
1355        if (p_class == float_class_inf && p_sign != c.sign) {
1356            s->float_exception_flags |= float_flag_invalid;
1357            return parts_default_nan(s);
1358        } else {
1359            a.cls = float_class_inf;
1360            a.sign = c.sign ^ sign_flip;
1361            return a;
1362        }
1363    }
1364
1365    if (p_class == float_class_inf) {
1366        a.cls = float_class_inf;
1367        a.sign = p_sign ^ sign_flip;
1368        return a;
1369    }
1370
1371    if (p_class == float_class_zero) {
1372        if (c.cls == float_class_zero) {
1373            if (p_sign != c.sign) {
1374                p_sign = s->float_rounding_mode == float_round_down;
1375            }
1376            c.sign = p_sign;
1377        } else if (flags & float_muladd_halve_result) {
1378            c.exp -= 1;
1379        }
1380        c.sign ^= sign_flip;
1381        return c;
1382    }
1383
1384    /* a & b should be normals now... */
1385    assert(a.cls == float_class_normal &&
1386           b.cls == float_class_normal);
1387
1388    p_exp = a.exp + b.exp;
1389
1390    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1391     * result.
1392     */
1393    mul64To128(a.frac, b.frac, &hi, &lo);
1394    /* binary point now at bit 124 */
1395
1396    /* check for overflow */
1397    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1398        shift128RightJamming(hi, lo, 1, &hi, &lo);
1399        p_exp += 1;
1400    }
1401
1402    /* + add/sub */
1403    if (c.cls == float_class_zero) {
1404        /* move binary point back to 62 */
1405        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1406    } else {
1407        int exp_diff = p_exp - c.exp;
1408        if (p_sign == c.sign) {
1409            /* Addition */
1410            if (exp_diff <= 0) {
1411                shift128RightJamming(hi, lo,
1412                                     DECOMPOSED_BINARY_POINT - exp_diff,
1413                                     &hi, &lo);
1414                lo += c.frac;
1415                p_exp = c.exp;
1416            } else {
1417                uint64_t c_hi, c_lo;
1418                /* shift c to the same binary point as the product (124) */
1419                c_hi = c.frac >> 2;
1420                c_lo = 0;
1421                shift128RightJamming(c_hi, c_lo,
1422                                     exp_diff,
1423                                     &c_hi, &c_lo);
1424                add128(hi, lo, c_hi, c_lo, &hi, &lo);
1425                /* move binary point back to 62 */
1426                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1427            }
1428
1429            if (lo & DECOMPOSED_OVERFLOW_BIT) {
1430                shift64RightJamming(lo, 1, &lo);
1431                p_exp += 1;
1432            }
1433
1434        } else {
1435            /* Subtraction */
1436            uint64_t c_hi, c_lo;
1437            /* make C binary point match product at bit 124 */
1438            c_hi = c.frac >> 2;
1439            c_lo = 0;
1440
1441            if (exp_diff <= 0) {
1442                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1443                if (exp_diff == 0
1444                    &&
1445                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1446                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1447                } else {
1448                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1449                    p_sign ^= 1;
1450                    p_exp = c.exp;
1451                }
1452            } else {
1453                shift128RightJamming(c_hi, c_lo,
1454                                     exp_diff,
1455                                     &c_hi, &c_lo);
1456                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1457            }
1458
1459            if (hi == 0 && lo == 0) {
1460                a.cls = float_class_zero;
1461                a.sign = s->float_rounding_mode == float_round_down;
1462                a.sign ^= sign_flip;
1463                return a;
1464            } else {
1465                int shift;
1466                if (hi != 0) {
1467                    shift = clz64(hi);
1468                } else {
1469                    shift = clz64(lo) + 64;
1470                }
1471                /* Normalizing to a binary point of 124 is the
1472                   correct adjust for the exponent.  However since we're
1473                   shifting, we might as well put the binary point back
1474                   at 62 where we really want it.  Therefore shift as
1475                   if we're leaving 1 bit at the top of the word, but
1476                   adjust the exponent as if we're leaving 3 bits.  */
1477                shift -= 1;
1478                if (shift >= 64) {
1479                    lo = lo << (shift - 64);
1480                } else {
1481                    hi = (hi << shift) | (lo >> (64 - shift));
1482                    lo = hi | ((lo << shift) != 0);
1483                }
1484                p_exp -= shift - 2;
1485            }
1486        }
1487    }
1488
1489    if (flags & float_muladd_halve_result) {
1490        p_exp -= 1;
1491    }
1492
1493    /* finally prepare our result */
1494    a.cls = float_class_normal;
1495    a.sign = p_sign ^ sign_flip;
1496    a.exp = p_exp;
1497    a.frac = lo;
1498
1499    return a;
1500}
1501
1502float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1503                                                int flags, float_status *status)
1504{
1505    FloatParts pa = float16_unpack_canonical(a, status);
1506    FloatParts pb = float16_unpack_canonical(b, status);
1507    FloatParts pc = float16_unpack_canonical(c, status);
1508    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1509
1510    return float16_round_pack_canonical(pr, status);
1511}
1512
1513static float32 QEMU_SOFTFLOAT_ATTR
1514soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1515                float_status *status)
1516{
1517    FloatParts pa = float32_unpack_canonical(a, status);
1518    FloatParts pb = float32_unpack_canonical(b, status);
1519    FloatParts pc = float32_unpack_canonical(c, status);
1520    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1521
1522    return float32_round_pack_canonical(pr, status);
1523}
1524
1525static float64 QEMU_SOFTFLOAT_ATTR
1526soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1527                float_status *status)
1528{
1529    FloatParts pa = float64_unpack_canonical(a, status);
1530    FloatParts pb = float64_unpack_canonical(b, status);
1531    FloatParts pc = float64_unpack_canonical(c, status);
1532    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1533
1534    return float64_round_pack_canonical(pr, status);
1535}
1536
1537static bool force_soft_fma;
1538
1539float32 QEMU_FLATTEN
1540float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1541{
1542    union_float32 ua, ub, uc, ur;
1543
1544    ua.s = xa;
1545    ub.s = xb;
1546    uc.s = xc;
1547
1548    if (unlikely(!can_use_fpu(s))) {
1549        goto soft;
1550    }
1551    if (unlikely(flags & float_muladd_halve_result)) {
1552        goto soft;
1553    }
1554
1555    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1556    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1557        goto soft;
1558    }
1559
1560    if (unlikely(force_soft_fma)) {
1561        goto soft;
1562    }
1563
1564    /*
1565     * When (a || b) == 0, there's no need to check for under/over flow,
1566     * since we know the addend is (normal || 0) and the product is 0.
1567     */
1568    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1569        union_float32 up;
1570        bool prod_sign;
1571
1572        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1573        prod_sign ^= !!(flags & float_muladd_negate_product);
1574        up.s = float32_set_sign(float32_zero, prod_sign);
1575
1576        if (flags & float_muladd_negate_c) {
1577            uc.h = -uc.h;
1578        }
1579        ur.h = up.h + uc.h;
1580    } else {
1581        union_float32 ua_orig = ua;
1582        union_float32 uc_orig = uc;
1583
1584        if (flags & float_muladd_negate_product) {
1585            ua.h = -ua.h;
1586        }
1587        if (flags & float_muladd_negate_c) {
1588            uc.h = -uc.h;
1589        }
1590
1591        ur.h = fmaf(ua.h, ub.h, uc.h);
1592
1593        if (unlikely(f32_is_inf(ur))) {
1594            s->float_exception_flags |= float_flag_overflow;
1595        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1596            ua = ua_orig;
1597            uc = uc_orig;
1598            goto soft;
1599        }
1600    }
1601    if (flags & float_muladd_negate_result) {
1602        return float32_chs(ur.s);
1603    }
1604    return ur.s;
1605
1606 soft:
1607    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1608}
1609
1610float64 QEMU_FLATTEN
1611float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1612{
1613    union_float64 ua, ub, uc, ur;
1614
1615    ua.s = xa;
1616    ub.s = xb;
1617    uc.s = xc;
1618
1619    if (unlikely(!can_use_fpu(s))) {
1620        goto soft;
1621    }
1622    if (unlikely(flags & float_muladd_halve_result)) {
1623        goto soft;
1624    }
1625
1626    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1627    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1628        goto soft;
1629    }
1630
1631    if (unlikely(force_soft_fma)) {
1632        goto soft;
1633    }
1634
1635    /*
1636     * When (a || b) == 0, there's no need to check for under/over flow,
1637     * since we know the addend is (normal || 0) and the product is 0.
1638     */
1639    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1640        union_float64 up;
1641        bool prod_sign;
1642
1643        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1644        prod_sign ^= !!(flags & float_muladd_negate_product);
1645        up.s = float64_set_sign(float64_zero, prod_sign);
1646
1647        if (flags & float_muladd_negate_c) {
1648            uc.h = -uc.h;
1649        }
1650        ur.h = up.h + uc.h;
1651    } else {
1652        union_float64 ua_orig = ua;
1653        union_float64 uc_orig = uc;
1654
1655        if (flags & float_muladd_negate_product) {
1656            ua.h = -ua.h;
1657        }
1658        if (flags & float_muladd_negate_c) {
1659            uc.h = -uc.h;
1660        }
1661
1662        ur.h = fma(ua.h, ub.h, uc.h);
1663
1664        if (unlikely(f64_is_inf(ur))) {
1665            s->float_exception_flags |= float_flag_overflow;
1666        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667            ua = ua_orig;
1668            uc = uc_orig;
1669            goto soft;
1670        }
1671    }
1672    if (flags & float_muladd_negate_result) {
1673        return float64_chs(ur.s);
1674    }
1675    return ur.s;
1676
1677 soft:
1678    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1679}
1680
1681/*
1682 * Returns the result of dividing the floating-point value `a' by the
1683 * corresponding value `b'. The operation is performed according to
1684 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1685 */
1686
1687static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1688{
1689    bool sign = a.sign ^ b.sign;
1690
1691    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1692        uint64_t n0, n1, q, r;
1693        int exp = a.exp - b.exp;
1694
1695        /*
1696         * We want a 2*N / N-bit division to produce exactly an N-bit
1697         * result, so that we do not lose any precision and so that we
1698         * do not have to renormalize afterward.  If A.frac < B.frac,
1699         * then division would produce an (N-1)-bit result; shift A left
1700         * by one to produce the an N-bit result, and decrement the
1701         * exponent to match.
1702         *
1703         * The udiv_qrnnd algorithm that we're using requires normalization,
1704         * i.e. the msb of the denominator must be set.  Since we know that
1705         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706         * by one (more), and the remainder must be shifted right by one.
1707         */
1708        if (a.frac < b.frac) {
1709            exp -= 1;
1710            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1711        } else {
1712            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1713        }
1714        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1715
1716        /*
1717         * Set lsb if there is a remainder, to set inexact.
1718         * As mentioned above, to find the actual value of the remainder we
1719         * would need to shift right, but (1) we are only concerned about
1720         * non-zero-ness, and (2) the remainder will always be even because
1721         * both inputs to the division primitive are even.
1722         */
1723        a.frac = q | (r != 0);
1724        a.sign = sign;
1725        a.exp = exp;
1726        return a;
1727    }
1728    /* handle all the NaN cases */
1729    if (is_nan(a.cls) || is_nan(b.cls)) {
1730        return pick_nan(a, b, s);
1731    }
1732    /* 0/0 or Inf/Inf */
1733    if (a.cls == b.cls
1734        &&
1735        (a.cls == float_class_inf || a.cls == float_class_zero)) {
1736        s->float_exception_flags |= float_flag_invalid;
1737        return parts_default_nan(s);
1738    }
1739    /* Inf / x or 0 / x */
1740    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1741        a.sign = sign;
1742        return a;
1743    }
1744    /* Div 0 => Inf */
1745    if (b.cls == float_class_zero) {
1746        s->float_exception_flags |= float_flag_divbyzero;
1747        a.cls = float_class_inf;
1748        a.sign = sign;
1749        return a;
1750    }
1751    /* Div by Inf */
1752    if (b.cls == float_class_inf) {
1753        a.cls = float_class_zero;
1754        a.sign = sign;
1755        return a;
1756    }
1757    g_assert_not_reached();
1758}
1759
1760float16 float16_div(float16 a, float16 b, float_status *status)
1761{
1762    FloatParts pa = float16_unpack_canonical(a, status);
1763    FloatParts pb = float16_unpack_canonical(b, status);
1764    FloatParts pr = div_floats(pa, pb, status);
1765
1766    return float16_round_pack_canonical(pr, status);
1767}
1768
1769static float32 QEMU_SOFTFLOAT_ATTR
1770soft_f32_div(float32 a, float32 b, float_status *status)
1771{
1772    FloatParts pa = float32_unpack_canonical(a, status);
1773    FloatParts pb = float32_unpack_canonical(b, status);
1774    FloatParts pr = div_floats(pa, pb, status);
1775
1776    return float32_round_pack_canonical(pr, status);
1777}
1778
1779static float64 QEMU_SOFTFLOAT_ATTR
1780soft_f64_div(float64 a, float64 b, float_status *status)
1781{
1782    FloatParts pa = float64_unpack_canonical(a, status);
1783    FloatParts pb = float64_unpack_canonical(b, status);
1784    FloatParts pr = div_floats(pa, pb, status);
1785
1786    return float64_round_pack_canonical(pr, status);
1787}
1788
/* Native C `float` division of a by b. */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1793
/* Native C `double` division of a by b. */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1798
1799static bool f32_div_pre(union_float32 a, union_float32 b)
1800{
1801    if (QEMU_HARDFLOAT_2F32_USE_FP) {
1802        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1803               fpclassify(b.h) == FP_NORMAL;
1804    }
1805    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1806}
1807
1808static bool f64_div_pre(union_float64 a, union_float64 b)
1809{
1810    if (QEMU_HARDFLOAT_2F64_USE_FP) {
1811        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1812               fpclassify(b.h) == FP_NORMAL;
1813    }
1814    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1815}
1816
1817static bool f32_div_post(union_float32 a, union_float32 b)
1818{
1819    if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820        return fpclassify(a.h) != FP_ZERO;
1821    }
1822    return !float32_is_zero(a.s);
1823}
1824
1825static bool f64_div_post(union_float64 a, union_float64 b)
1826{
1827    if (QEMU_HARDFLOAT_2F64_USE_FP) {
1828        return fpclassify(a.h) != FP_ZERO;
1829    }
1830    return !float64_is_zero(a.s);
1831}
1832
1833float32 QEMU_FLATTEN
1834float32_div(float32 a, float32 b, float_status *s)
1835{
1836    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1837                        f32_div_pre, f32_div_post, NULL, NULL);
1838}
1839
1840float64 QEMU_FLATTEN
1841float64_div(float64 a, float64 b, float_status *s)
1842{
1843    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1844                        f64_div_pre, f64_div_post, NULL, NULL);
1845}
1846
1847/*
1848 * Float to Float conversions
1849 *
1850 * Returns the result of converting one float format to another. The
1851 * conversion is performed according to the IEC/IEEE Standard for
1852 * Binary Floating-Point Arithmetic.
1853 *
1854 * The float_to_float helper only needs to take care of raising
1855 * invalid exceptions and handling the conversion on NaNs.
1856 */
1857
1858static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1859                                 float_status *s)
1860{
1861    if (dstf->arm_althp) {
1862        switch (a.cls) {
1863        case float_class_qnan:
1864        case float_class_snan:
1865            /* There is no NaN in the destination format.  Raise Invalid
1866             * and return a zero with the sign of the input NaN.
1867             */
1868            s->float_exception_flags |= float_flag_invalid;
1869            a.cls = float_class_zero;
1870            a.frac = 0;
1871            a.exp = 0;
1872            break;
1873
1874        case float_class_inf:
1875            /* There is no Inf in the destination format.  Raise Invalid
1876             * and return the maximum normal with the correct sign.
1877             */
1878            s->float_exception_flags |= float_flag_invalid;
1879            a.cls = float_class_normal;
1880            a.exp = dstf->exp_max;
1881            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1882            break;
1883
1884        default:
1885            break;
1886        }
1887    } else if (is_nan(a.cls)) {
1888        if (is_snan(a.cls)) {
1889            s->float_exception_flags |= float_flag_invalid;
1890            a = parts_silence_nan(a, s);
1891        }
1892        if (s->default_nan_mode) {
1893            return parts_default_nan(s);
1894        }
1895    }
1896    return a;
1897}
1898
1899float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1900{
1901    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1902    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1903    FloatParts pr = float_to_float(p, &float32_params, s);
1904    return float32_round_pack_canonical(pr, s);
1905}
1906
1907float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1908{
1909    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911    FloatParts pr = float_to_float(p, &float64_params, s);
1912    return float64_round_pack_canonical(pr, s);
1913}
1914
1915float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1916{
1917    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918    FloatParts p = float32_unpack_canonical(a, s);
1919    FloatParts pr = float_to_float(p, fmt16, s);
1920    return float16a_round_pack_canonical(pr, s, fmt16);
1921}
1922
1923static float64 QEMU_SOFTFLOAT_ATTR
1924soft_float32_to_float64(float32 a, float_status *s)
1925{
1926    FloatParts p = float32_unpack_canonical(a, s);
1927    FloatParts pr = float_to_float(p, &float64_params, s);
1928    return float64_round_pack_canonical(pr, s);
1929}
1930
1931float64 float32_to_float64(float32 a, float_status *s)
1932{
1933    if (likely(float32_is_normal(a))) {
1934        /* Widening conversion can never produce inexact results.  */
1935        union_float32 uf;
1936        union_float64 ud;
1937        uf.s = a;
1938        ud.h = uf.h;
1939        return ud.s;
1940    } else if (float32_is_zero(a)) {
1941        return float64_set_sign(float64_zero, float32_is_neg(a));
1942    } else {
1943        return soft_float32_to_float64(a, s);
1944    }
1945}
1946
1947float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1948{
1949    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1950    FloatParts p = float64_unpack_canonical(a, s);
1951    FloatParts pr = float_to_float(p, fmt16, s);
1952    return float16a_round_pack_canonical(pr, s, fmt16);
1953}
1954
1955float32 float64_to_float32(float64 a, float_status *s)
1956{
1957    FloatParts p = float64_unpack_canonical(a, s);
1958    FloatParts pr = float_to_float(p, &float32_params, s);
1959    return float32_round_pack_canonical(pr, s);
1960}
1961
1962/*
1963 * Rounds the floating-point value `a' to an integer, and returns the
1964 * result as a floating-point value. The operation is performed
1965 * according to the IEC/IEEE Standard for Binary Floating-Point
1966 * Arithmetic.
1967 */
1968
1969static FloatParts round_to_int(FloatParts a, int rmode,
1970                               int scale, float_status *s)
1971{
1972    switch (a.cls) {
1973    case float_class_qnan:
1974    case float_class_snan:
1975        return return_nan(a, s);
1976
1977    case float_class_zero:
1978    case float_class_inf:
1979        /* already "integral" */
1980        break;
1981
1982    case float_class_normal:
1983        scale = MIN(MAX(scale, -0x10000), 0x10000);
1984        a.exp += scale;
1985
1986        if (a.exp >= DECOMPOSED_BINARY_POINT) {
1987            /* already integral */
1988            break;
1989        }
1990        if (a.exp < 0) {
1991            bool one;
1992            /* all fractional */
1993            s->float_exception_flags |= float_flag_inexact;
1994            switch (rmode) {
1995            case float_round_nearest_even:
1996                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1997                break;
1998            case float_round_ties_away:
1999                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2000                break;
2001            case float_round_to_zero:
2002                one = false;
2003                break;
2004            case float_round_up:
2005                one = !a.sign;
2006                break;
2007            case float_round_down:
2008                one = a.sign;
2009                break;
2010            case float_round_to_odd:
2011                one = true;
2012                break;
2013            default:
2014                g_assert_not_reached();
2015            }
2016
2017            if (one) {
2018                a.frac = DECOMPOSED_IMPLICIT_BIT;
2019                a.exp = 0;
2020            } else {
2021                a.cls = float_class_zero;
2022            }
2023        } else {
2024            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2025            uint64_t frac_lsbm1 = frac_lsb >> 1;
2026            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2027            uint64_t rnd_mask = rnd_even_mask >> 1;
2028            uint64_t inc;
2029
2030            switch (rmode) {
2031            case float_round_nearest_even:
2032                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2033                break;
2034            case float_round_ties_away:
2035                inc = frac_lsbm1;
2036                break;
2037            case float_round_to_zero:
2038                inc = 0;
2039                break;
2040            case float_round_up:
2041                inc = a.sign ? 0 : rnd_mask;
2042                break;
2043            case float_round_down:
2044                inc = a.sign ? rnd_mask : 0;
2045                break;
2046            case float_round_to_odd:
2047                inc = a.frac & frac_lsb ? 0 : rnd_mask;
2048                break;
2049            default:
2050                g_assert_not_reached();
2051            }
2052
2053            if (a.frac & rnd_mask) {
2054                s->float_exception_flags |= float_flag_inexact;
2055                a.frac += inc;
2056                a.frac &= ~rnd_mask;
2057                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2058                    a.frac >>= 1;
2059                    a.exp++;
2060                }
2061            }
2062        }
2063        break;
2064    default:
2065        g_assert_not_reached();
2066    }
2067    return a;
2068}
2069
2070float16 float16_round_to_int(float16 a, float_status *s)
2071{
2072    FloatParts pa = float16_unpack_canonical(a, s);
2073    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2074    return float16_round_pack_canonical(pr, s);
2075}
2076
2077float32 float32_round_to_int(float32 a, float_status *s)
2078{
2079    FloatParts pa = float32_unpack_canonical(a, s);
2080    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2081    return float32_round_pack_canonical(pr, s);
2082}
2083
2084float64 float64_round_to_int(float64 a, float_status *s)
2085{
2086    FloatParts pa = float64_unpack_canonical(a, s);
2087    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2088    return float64_round_pack_canonical(pr, s);
2089}
2090
2091/*
2092 * Returns the result of converting the floating-point value `a' to
2093 * the two's complement integer format. The conversion is performed
2094 * according to the IEC/IEEE Standard for Binary Floating-Point
2095 * Arithmetic---which means in particular that the conversion is
2096 * rounded according to the current rounding mode. If `a' is a NaN,
2097 * the largest positive integer is returned. Otherwise, if the
2098 * conversion overflows, the largest integer with the same sign as `a'
2099 * is returned.
2100*/
2101
2102static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2103                                     int64_t min, int64_t max,
2104                                     float_status *s)
2105{
2106    uint64_t r;
2107    int orig_flags = get_float_exception_flags(s);
2108    FloatParts p = round_to_int(in, rmode, scale, s);
2109
2110    switch (p.cls) {
2111    case float_class_snan:
2112    case float_class_qnan:
2113        s->float_exception_flags = orig_flags | float_flag_invalid;
2114        return max;
2115    case float_class_inf:
2116        s->float_exception_flags = orig_flags | float_flag_invalid;
2117        return p.sign ? min : max;
2118    case float_class_zero:
2119        return 0;
2120    case float_class_normal:
2121        if (p.exp < DECOMPOSED_BINARY_POINT) {
2122            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2123        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2124            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2125        } else {
2126            r = UINT64_MAX;
2127        }
2128        if (p.sign) {
2129            if (r <= -(uint64_t) min) {
2130                return -r;
2131            } else {
2132                s->float_exception_flags = orig_flags | float_flag_invalid;
2133                return min;
2134            }
2135        } else {
2136            if (r <= max) {
2137                return r;
2138            } else {
2139                s->float_exception_flags = orig_flags | float_flag_invalid;
2140                return max;
2141            }
2142        }
2143    default:
2144        g_assert_not_reached();
2145    }
2146}
2147
2148int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2149                                float_status *s)
2150{
2151    return round_to_int_and_pack(float16_unpack_canonical(a, s),
2152                                 rmode, scale, INT16_MIN, INT16_MAX, s);
2153}
2154
2155int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2156                                float_status *s)
2157{
2158    return round_to_int_and_pack(float16_unpack_canonical(a, s),
2159                                 rmode, scale, INT32_MIN, INT32_MAX, s);
2160}
2161
2162int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2163                                float_status *s)
2164{
2165    return round_to_int_and_pack(float16_unpack_canonical(a, s),
2166                                 rmode, scale, INT64_MIN, INT64_MAX, s);
2167}
2168
2169int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2170                                float_status *s)
2171{
2172    return round_to_int_and_pack(float32_unpack_canonical(a, s),
2173                                 rmode, scale, INT16_MIN, INT16_MAX, s);
2174}
2175
2176int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2177                                float_status *s)
2178{
2179    return round_to_int_and_pack(float32_unpack_canonical(a, s),
2180                                 rmode, scale, INT32_MIN, INT32_MAX, s);
2181}
2182
2183int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2184                                float_status *s)
2185{
2186    return round_to_int_and_pack(float32_unpack_canonical(a, s),
2187                                 rmode, scale, INT64_MIN, INT64_MAX, s);
2188}
2189
2190int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2191                                float_status *s)
2192{
2193    return round_to_int_and_pack(float64_unpack_canonical(a, s),
2194                                 rmode, scale, INT16_MIN, INT16_MAX, s);
2195}
2196
2197int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2198                                float_status *s)
2199{
2200    return round_to_int_and_pack(float64_unpack_canonical(a, s),
2201                                 rmode, scale, INT32_MIN, INT32_MAX, s);
2202}
2203
2204int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2205                                float_status *s)
2206{
2207    return round_to_int_and_pack(float64_unpack_canonical(a, s),
2208                                 rmode, scale, INT64_MIN, INT64_MAX, s);
2209}
2210
2211int16_t float16_to_int16(float16 a, float_status *s)
2212{
2213    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2214}
2215
2216int32_t float16_to_int32(float16 a, float_status *s)
2217{
2218    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2219}
2220
2221int64_t float16_to_int64(float16 a, float_status *s)
2222{
2223    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2224}
2225
2226int16_t float32_to_int16(float32 a, float_status *s)
2227{
2228    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2229}
2230
2231int32_t float32_to_int32(float32 a, float_status *s)
2232{
2233    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2234}
2235
2236int64_t float32_to_int64(float32 a, float_status *s)
2237{
2238    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2239}
2240
2241int16_t float64_to_int16(float64 a, float_status *s)
2242{
2243    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2244}
2245
2246int32_t float64_to_int32(float64 a, float_status *s)
2247{
2248    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2249}
2250
2251int64_t float64_to_int64(float64 a, float_status *s)
2252{
2253    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2254}
2255
2256int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2257{
2258    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2259}
2260
2261int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2262{
2263    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2264}
2265
2266int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2267{
2268    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2269}
2270
2271int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2272{
2273    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2274}
2275
2276int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2277{
2278    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2279}
2280
2281int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2282{
2283    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2284}
2285
2286int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2287{
2288    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2289}
2290
2291int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2292{
2293    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2294}
2295
2296int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2297{
2298    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2299}
2300
2301/*
2302 *  Returns the result of converting the floating-point value `a' to
2303 *  the unsigned integer format. The conversion is performed according
2304 *  to the IEC/IEEE Standard for Binary Floating-Point
2305 *  Arithmetic---which means in particular that the conversion is
2306 *  rounded according to the current rounding mode. If `a' is a NaN,
2307 *  the largest unsigned integer is returned. Otherwise, if the
2308 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
2312 */
2313
2314static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2315                                       uint64_t max, float_status *s)
2316{
2317    int orig_flags = get_float_exception_flags(s);
2318    FloatParts p = round_to_int(in, rmode, scale, s);
2319    uint64_t r;
2320
2321    switch (p.cls) {
2322    case float_class_snan:
2323    case float_class_qnan:
2324        s->float_exception_flags = orig_flags | float_flag_invalid;
2325        return max;
2326    case float_class_inf:
2327        s->float_exception_flags = orig_flags | float_flag_invalid;
2328        return p.sign ? 0 : max;
2329    case float_class_zero:
2330        return 0;
2331    case float_class_normal:
2332        if (p.sign) {
2333            s->float_exception_flags = orig_flags | float_flag_invalid;
2334            return 0;
2335        }
2336
2337        if (p.exp < DECOMPOSED_BINARY_POINT) {
2338            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2339        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2340            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2341        } else {
2342            s->float_exception_flags = orig_flags | float_flag_invalid;
2343            return max;
2344        }
2345
2346        /* For uint64 this will never trip, but if p.exp is too large
2347         * to shift a decomposed fraction we shall have exited via the
2348         * 3rd leg above.
2349         */
2350        if (r > max) {
2351            s->float_exception_flags = orig_flags | float_flag_invalid;
2352            return max;
2353        }
2354        return r;
2355    default:
2356        g_assert_not_reached();
2357    }
2358}
2359
2360uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2361                                  float_status *s)
2362{
2363    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2364                                  rmode, scale, UINT16_MAX, s);
2365}
2366
2367uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2368                                  float_status *s)
2369{
2370    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2371                                  rmode, scale, UINT32_MAX, s);
2372}
2373
2374uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2375                                  float_status *s)
2376{
2377    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2378                                  rmode, scale, UINT64_MAX, s);
2379}
2380
2381uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2382                                  float_status *s)
2383{
2384    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2385                                  rmode, scale, UINT16_MAX, s);
2386}
2387
2388uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2389                                  float_status *s)
2390{
2391    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2392                                  rmode, scale, UINT32_MAX, s);
2393}
2394
2395uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2396                                  float_status *s)
2397{
2398    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2399                                  rmode, scale, UINT64_MAX, s);
2400}
2401
2402uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2403                                  float_status *s)
2404{
2405    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2406                                  rmode, scale, UINT16_MAX, s);
2407}
2408
2409uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2410                                  float_status *s)
2411{
2412    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2413                                  rmode, scale, UINT32_MAX, s);
2414}
2415
2416uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2417                                  float_status *s)
2418{
2419    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2420                                  rmode, scale, UINT64_MAX, s);
2421}
2422
2423uint16_t float16_to_uint16(float16 a, float_status *s)
2424{
2425    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2426}
2427
2428uint32_t float16_to_uint32(float16 a, float_status *s)
2429{
2430    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2431}
2432
2433uint64_t float16_to_uint64(float16 a, float_status *s)
2434{
2435    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2436}
2437
2438uint16_t float32_to_uint16(float32 a, float_status *s)
2439{
2440    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2441}
2442
2443uint32_t float32_to_uint32(float32 a, float_status *s)
2444{
2445    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2446}
2447
2448uint64_t float32_to_uint64(float32 a, float_status *s)
2449{
2450    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2451}
2452
2453uint16_t float64_to_uint16(float64 a, float_status *s)
2454{
2455    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2456}
2457
2458uint32_t float64_to_uint32(float64 a, float_status *s)
2459{
2460    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2461}
2462
2463uint64_t float64_to_uint64(float64 a, float_status *s)
2464{
2465    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2466}
2467
2468uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2469{
2470    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2471}
2472
2473uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2474{
2475    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2476}
2477
2478uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2479{
2480    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2481}
2482
2483uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2484{
2485    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2486}
2487
2488uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2489{
2490    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2491}
2492
2493uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2494{
2495    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2496}
2497
2498uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2499{
2500    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2501}
2502
2503uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2504{
2505    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2506}
2507
2508uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2509{
2510    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2511}
2512
2513/*
2514 * Integer to float conversions
2515 *
2516 * Returns the result of converting the two's complement integer `a'
2517 * to the floating-point format. The conversion is performed according
2518 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2519 */
2520
2521static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2522{
2523    FloatParts r = { .sign = false };
2524
2525    if (a == 0) {
2526        r.cls = float_class_zero;
2527    } else {
2528        uint64_t f = a;
2529        int shift;
2530
2531        r.cls = float_class_normal;
2532        if (a < 0) {
2533            f = -f;
2534            r.sign = true;
2535        }
2536        shift = clz64(f) - 1;
2537        scale = MIN(MAX(scale, -0x10000), 0x10000);
2538
2539        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2540        r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2541    }
2542
2543    return r;
2544}
2545
2546float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2547{
2548    FloatParts pa = int_to_float(a, scale, status);
2549    return float16_round_pack_canonical(pa, status);
2550}
2551
2552float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2553{
2554    return int64_to_float16_scalbn(a, scale, status);
2555}
2556
2557float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2558{
2559    return int64_to_float16_scalbn(a, scale, status);
2560}
2561
2562float16 int64_to_float16(int64_t a, float_status *status)
2563{
2564    return int64_to_float16_scalbn(a, 0, status);
2565}
2566
2567float16 int32_to_float16(int32_t a, float_status *status)
2568{
2569    return int64_to_float16_scalbn(a, 0, status);
2570}
2571
2572float16 int16_to_float16(int16_t a, float_status *status)
2573{
2574    return int64_to_float16_scalbn(a, 0, status);
2575}
2576
2577float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2578{
2579    FloatParts pa = int_to_float(a, scale, status);
2580    return float32_round_pack_canonical(pa, status);
2581}
2582
2583float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2584{
2585    return int64_to_float32_scalbn(a, scale, status);
2586}
2587
2588float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2589{
2590    return int64_to_float32_scalbn(a, scale, status);
2591}
2592
2593float32 int64_to_float32(int64_t a, float_status *status)
2594{
2595    return int64_to_float32_scalbn(a, 0, status);
2596}
2597
2598float32 int32_to_float32(int32_t a, float_status *status)
2599{
2600    return int64_to_float32_scalbn(a, 0, status);
2601}
2602
2603float32 int16_to_float32(int16_t a, float_status *status)
2604{
2605    return int64_to_float32_scalbn(a, 0, status);
2606}
2607
2608float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2609{
2610    FloatParts pa = int_to_float(a, scale, status);
2611    return float64_round_pack_canonical(pa, status);
2612}
2613
2614float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2615{
2616    return int64_to_float64_scalbn(a, scale, status);
2617}
2618
2619float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2620{
2621    return int64_to_float64_scalbn(a, scale, status);
2622}
2623
2624float64 int64_to_float64(int64_t a, float_status *status)
2625{
2626    return int64_to_float64_scalbn(a, 0, status);
2627}
2628
2629float64 int32_to_float64(int32_t a, float_status *status)
2630{
2631    return int64_to_float64_scalbn(a, 0, status);
2632}
2633
2634float64 int16_to_float64(int16_t a, float_status *status)
2635{
2636    return int64_to_float64_scalbn(a, 0, status);
2637}
2638
2639
2640/*
2641 * Unsigned Integer to float conversions
2642 *
2643 * Returns the result of converting the unsigned integer `a' to the
2644 * floating-point format. The conversion is performed according to the
2645 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2646 */
2647
2648static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2649{
2650    FloatParts r = { .sign = false };
2651
2652    if (a == 0) {
2653        r.cls = float_class_zero;
2654    } else {
2655        scale = MIN(MAX(scale, -0x10000), 0x10000);
2656        r.cls = float_class_normal;
2657        if ((int64_t)a < 0) {
2658            r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2659            shift64RightJamming(a, 1, &a);
2660            r.frac = a;
2661        } else {
2662            int shift = clz64(a) - 1;
2663            r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2664            r.frac = a << shift;
2665        }
2666    }
2667
2668    return r;
2669}
2670
2671float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2672{
2673    FloatParts pa = uint_to_float(a, scale, status);
2674    return float16_round_pack_canonical(pa, status);
2675}
2676
2677float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2678{
2679    return uint64_to_float16_scalbn(a, scale, status);
2680}
2681
2682float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2683{
2684    return uint64_to_float16_scalbn(a, scale, status);
2685}
2686
2687float16 uint64_to_float16(uint64_t a, float_status *status)
2688{
2689    return uint64_to_float16_scalbn(a, 0, status);
2690}
2691
2692float16 uint32_to_float16(uint32_t a, float_status *status)
2693{
2694    return uint64_to_float16_scalbn(a, 0, status);
2695}
2696
2697float16 uint16_to_float16(uint16_t a, float_status *status)
2698{
2699    return uint64_to_float16_scalbn(a, 0, status);
2700}
2701
2702float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2703{
2704    FloatParts pa = uint_to_float(a, scale, status);
2705    return float32_round_pack_canonical(pa, status);
2706}
2707
2708float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2709{
2710    return uint64_to_float32_scalbn(a, scale, status);
2711}
2712
2713float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2714{
2715    return uint64_to_float32_scalbn(a, scale, status);
2716}
2717
2718float32 uint64_to_float32(uint64_t a, float_status *status)
2719{
2720    return uint64_to_float32_scalbn(a, 0, status);
2721}
2722
2723float32 uint32_to_float32(uint32_t a, float_status *status)
2724{
2725    return uint64_to_float32_scalbn(a, 0, status);
2726}
2727
2728float32 uint16_to_float32(uint16_t a, float_status *status)
2729{
2730    return uint64_to_float32_scalbn(a, 0, status);
2731}
2732
2733float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2734{
2735    FloatParts pa = uint_to_float(a, scale, status);
2736    return float64_round_pack_canonical(pa, status);
2737}
2738
2739float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2740{
2741    return uint64_to_float64_scalbn(a, scale, status);
2742}
2743
2744float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2745{
2746    return uint64_to_float64_scalbn(a, scale, status);
2747}
2748
2749float64 uint64_to_float64(uint64_t a, float_status *status)
2750{
2751    return uint64_to_float64_scalbn(a, 0, status);
2752}
2753
2754float64 uint32_to_float64(uint32_t a, float_status *status)
2755{
2756    return uint64_to_float64_scalbn(a, 0, status);
2757}
2758
2759float64 uint16_to_float64(uint16_t a, float_status *status)
2760{
2761    return uint64_to_float64_scalbn(a, 0, status);
2762}
2763
2764/* Float Min/Max */
2765/* min() and max() functions. These can't be implemented as
2766 * 'compare and pick one input' because that would mishandle
2767 * NaNs and +0 vs -0.
2768 *
2769 * minnum() and maxnum() functions. These are similar to the min()
2770 * and max() functions but if one of the arguments is a QNaN and
2771 * the other is numerical then the numerical argument is returned.
2772 * SNaNs will get quietened before being returned.
2773 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2774 * and maxNum() operations. min() and max() are the typical min/max
2775 * semantics provided by many CPUs which predate that specification.
2776 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2779 */
2780static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2781                                bool ieee, bool ismag, float_status *s)
2782{
2783    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2784        if (ieee) {
2785            /* Takes two floating-point values `a' and `b', one of
2786             * which is a NaN, and returns the appropriate NaN
2787             * result. If either `a' or `b' is a signaling NaN,
2788             * the invalid exception is raised.
2789             */
2790            if (is_snan(a.cls) || is_snan(b.cls)) {
2791                return pick_nan(a, b, s);
2792            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2793                return b;
2794            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2795                return a;
2796            }
2797        }
2798        return pick_nan(a, b, s);
2799    } else {
2800        int a_exp, b_exp;
2801
2802        switch (a.cls) {
2803        case float_class_normal:
2804            a_exp = a.exp;
2805            break;
2806        case float_class_inf:
2807            a_exp = INT_MAX;
2808            break;
2809        case float_class_zero:
2810            a_exp = INT_MIN;
2811            break;
2812        default:
2813            g_assert_not_reached();
2814            break;
2815        }
2816        switch (b.cls) {
2817        case float_class_normal:
2818            b_exp = b.exp;
2819            break;
2820        case float_class_inf:
2821            b_exp = INT_MAX;
2822            break;
2823        case float_class_zero:
2824            b_exp = INT_MIN;
2825            break;
2826        default:
2827            g_assert_not_reached();
2828            break;
2829        }
2830
2831        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2832            bool a_less = a_exp < b_exp;
2833            if (a_exp == b_exp) {
2834                a_less = a.frac < b.frac;
2835            }
2836            return a_less ^ ismin ? b : a;
2837        }
2838
2839        if (a.sign == b.sign) {
2840            bool a_less = a_exp < b_exp;
2841            if (a_exp == b_exp) {
2842                a_less = a.frac < b.frac;
2843            }
2844            return a.sign ^ a_less ^ ismin ? b : a;
2845        } else {
2846            return a.sign ^ ismin ? b : a;
2847        }
2848    }
2849}
2850
/*
 * MINMAX(sz, name, ismin, isiee, ismag) defines the public function
 * float<sz>_<name>(a, b, s).  The generated function unpacks both
 * operands with float<sz>_unpack_canonical(), calls minmax_floats() with
 * the three boolean selectors, and returns the result through
 * float<sz>_round_pack_canonical().  `isiee' is forwarded as the `ieee'
 * argument of minmax_floats().
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
    FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
                                                                        \
    return float ## sz ## _round_pack_canonical(pr, s);                 \
}

/* Argument order below: size, name, ismin, isiee, ismag. */
MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX
2884
2885/* Floating point compare */
2886static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2887                          float_status *s)
2888{
2889    if (is_nan(a.cls) || is_nan(b.cls)) {
2890        if (!is_quiet ||
2891            a.cls == float_class_snan ||
2892            b.cls == float_class_snan) {
2893            s->float_exception_flags |= float_flag_invalid;
2894        }
2895        return float_relation_unordered;
2896    }
2897
2898    if (a.cls == float_class_zero) {
2899        if (b.cls == float_class_zero) {
2900            return float_relation_equal;
2901        }
2902        return b.sign ? float_relation_greater : float_relation_less;
2903    } else if (b.cls == float_class_zero) {
2904        return a.sign ? float_relation_less : float_relation_greater;
2905    }
2906
2907    /* The only really important thing about infinity is its sign. If
2908     * both are infinities the sign marks the smallest of the two.
2909     */
2910    if (a.cls == float_class_inf) {
2911        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2912            return float_relation_equal;
2913        }
2914        return a.sign ? float_relation_less : float_relation_greater;
2915    } else if (b.cls == float_class_inf) {
2916        return b.sign ? float_relation_greater : float_relation_less;
2917    }
2918
2919    if (a.sign != b.sign) {
2920        return a.sign ? float_relation_less : float_relation_greater;
2921    }
2922
2923    if (a.exp == b.exp) {
2924        if (a.frac == b.frac) {
2925            return float_relation_equal;
2926        }
2927        if (a.sign) {
2928            return a.frac > b.frac ?
2929                float_relation_less : float_relation_greater;
2930        } else {
2931            return a.frac > b.frac ?
2932                float_relation_greater : float_relation_less;
2933        }
2934    } else {
2935        if (a.sign) {
2936            return a.exp > b.exp ? float_relation_less : float_relation_greater;
2937        } else {
2938            return a.exp > b.exp ? float_relation_greater : float_relation_less;
2939        }
2940    }
2941}
2942
/*
 * COMPARE(name, attr, sz) defines a static function `name' carrying the
 * function attribute `attr'.  The generated function unpacks two
 * float<sz> operands with float<sz>_unpack_canonical() and returns the
 * result of compare_floats() on them, forwarding `is_quiet' and `s'.
 */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

/* One soft-float comparison helper per supported width. */
COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE
2957
2958int float16_compare(float16 a, float16 b, float_status *s)
2959{
2960    return soft_f16_compare(a, b, false, s);
2961}
2962
2963int float16_compare_quiet(float16 a, float16 b, float_status *s)
2964{
2965    return soft_f16_compare(a, b, true, s);
2966}
2967
2968static int QEMU_FLATTEN
2969f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2970{
2971    union_float32 ua, ub;
2972
2973    ua.s = xa;
2974    ub.s = xb;
2975
2976    if (QEMU_NO_HARDFLOAT) {
2977        goto soft;
2978    }
2979
2980    float32_input_flush2(&ua.s, &ub.s, s);
2981    if (isgreaterequal(ua.h, ub.h)) {
2982        if (isgreater(ua.h, ub.h)) {
2983            return float_relation_greater;
2984        }
2985        return float_relation_equal;
2986    }
2987    if (likely(isless(ua.h, ub.h))) {
2988        return float_relation_less;
2989    }
2990    /* The only condition remaining is unordered.
2991     * Fall through to set flags.
2992     */
2993 soft:
2994    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2995}
2996
2997int float32_compare(float32 a, float32 b, float_status *s)
2998{
2999    return f32_compare(a, b, false, s);
3000}
3001
3002int float32_compare_quiet(float32 a, float32 b, float_status *s)
3003{
3004    return f32_compare(a, b, true, s);
3005}
3006
3007static int QEMU_FLATTEN
3008f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3009{
3010    union_float64 ua, ub;
3011
3012    ua.s = xa;
3013    ub.s = xb;
3014
3015    if (QEMU_NO_HARDFLOAT) {
3016        goto soft;
3017    }
3018
3019    float64_input_flush2(&ua.s, &ub.s, s);
3020    if (isgreaterequal(ua.h, ub.h)) {
3021        if (isgreater(ua.h, ub.h)) {
3022            return float_relation_greater;
3023        }
3024        return float_relation_equal;
3025    }
3026    if (likely(isless(ua.h, ub.h))) {
3027        return float_relation_less;
3028    }
3029    /* The only condition remaining is unordered.
3030     * Fall through to set flags.
3031     */
3032 soft:
3033    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3034}
3035
3036int float64_compare(float64 a, float64 b, float_status *s)
3037{
3038    return f64_compare(a, b, false, s);
3039}
3040
3041int float64_compare_quiet(float64 a, float64 b, float_status *s)
3042{
3043    return f64_compare(a, b, true, s);
3044}
3045
3046/* Multiply A by 2 raised to the power N.  */
3047static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3048{
3049    if (unlikely(is_nan(a.cls))) {
3050        return return_nan(a, s);
3051    }
3052    if (a.cls == float_class_normal) {
3053        /* The largest float type (even though not supported by FloatParts)
3054         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3055         * still allows rounding to infinity, without allowing overflow
3056         * within the int32_t that backs FloatParts.exp.
3057         */
3058        n = MIN(MAX(n, -0x10000), 0x10000);
3059        a.exp += n;
3060    }
3061    return a;
3062}
3063
3064float16 float16_scalbn(float16 a, int n, float_status *status)
3065{
3066    FloatParts pa = float16_unpack_canonical(a, status);
3067    FloatParts pr = scalbn_decomposed(pa, n, status);
3068    return float16_round_pack_canonical(pr, status);
3069}
3070
3071float32 float32_scalbn(float32 a, int n, float_status *status)
3072{
3073    FloatParts pa = float32_unpack_canonical(a, status);
3074    FloatParts pr = scalbn_decomposed(pa, n, status);
3075    return float32_round_pack_canonical(pr, status);
3076}
3077
3078float64 float64_scalbn(float64 a, int n, float_status *status)
3079{
3080    FloatParts pa = float64_unpack_canonical(a, status);
3081    FloatParts pr = scalbn_decomposed(pa, n, status);
3082    return float64_round_pack_canonical(pr, status);
3083}
3084
3085/*
3086 * Square Root
3087 *
3088 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3090 * square root by iterating down from the implicit bit to enough extra
3091 * bits to ensure we get a correctly rounded result.
3092 *
3093 * This does mean however the calculation is slower than before,
3094 * especially for 64 bit floats.
3095 */
3096
3097static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3098{
3099    uint64_t a_frac, r_frac, s_frac;
3100    int bit, last_bit;
3101
3102    if (is_nan(a.cls)) {
3103        return return_nan(a, s);
3104    }
3105    if (a.cls == float_class_zero) {
3106        return a;  /* sqrt(+-0) = +-0 */
3107    }
3108    if (a.sign) {
3109        s->float_exception_flags |= float_flag_invalid;
3110        return parts_default_nan(s);
3111    }
3112    if (a.cls == float_class_inf) {
3113        return a;  /* sqrt(+inf) = +inf */
3114    }
3115
3116    assert(a.cls == float_class_normal);
3117
3118    /* We need two overflow bits at the top. Adding room for that is a
3119     * right shift. If the exponent is odd, we can discard the low bit
3120     * by multiplying the fraction by 2; that's a left shift. Combine
3121     * those and we shift right if the exponent is even.
3122     */
3123    a_frac = a.frac;
3124    if (!(a.exp & 1)) {
3125        a_frac >>= 1;
3126    }
3127    a.exp >>= 1;
3128
3129    /* Bit-by-bit computation of sqrt.  */
3130    r_frac = 0;
3131    s_frac = 0;
3132
3133    /* Iterate from implicit bit down to the 3 extra bits to compute a
3134     * properly rounded result. Remember we've inserted one more bit
3135     * at the top, so these positions are one less.
3136     */
3137    bit = DECOMPOSED_BINARY_POINT - 1;
3138    last_bit = MAX(p->frac_shift - 4, 0);
3139    do {
3140        uint64_t q = 1ULL << bit;
3141        uint64_t t_frac = s_frac + q;
3142        if (t_frac <= a_frac) {
3143            s_frac = t_frac + q;
3144            a_frac -= t_frac;
3145            r_frac += q;
3146        }
3147        a_frac <<= 1;
3148    } while (--bit >= last_bit);
3149
3150    /* Undo the right shift done above. If there is any remaining
3151     * fraction, the result is inexact. Set the sticky bit.
3152     */
3153    a.frac = (r_frac << 1) + (a_frac != 0);
3154
3155    return a;
3156}
3157
3158float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3159{
3160    FloatParts pa = float16_unpack_canonical(a, status);
3161    FloatParts pr = sqrt_float(pa, status, &float16_params);
3162    return float16_round_pack_canonical(pr, status);
3163}
3164
3165static float32 QEMU_SOFTFLOAT_ATTR
3166soft_f32_sqrt(float32 a, float_status *status)
3167{
3168    FloatParts pa = float32_unpack_canonical(a, status);
3169    FloatParts pr = sqrt_float(pa, status, &float32_params);
3170    return float32_round_pack_canonical(pr, status);
3171}
3172
3173static float64 QEMU_SOFTFLOAT_ATTR
3174soft_f64_sqrt(float64 a, float_status *status)
3175{
3176    FloatParts pa = float64_unpack_canonical(a, status);
3177    FloatParts pr = sqrt_float(pa, status, &float64_params);
3178    return float64_round_pack_canonical(pr, status);
3179}
3180
3181float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3182{
3183    union_float32 ua, ur;
3184
3185    ua.s = xa;
3186    if (unlikely(!can_use_fpu(s))) {
3187        goto soft;
3188    }
3189
3190    float32_input_flush1(&ua.s, s);
3191    if (QEMU_HARDFLOAT_1F32_USE_FP) {
3192        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3193                       fpclassify(ua.h) == FP_ZERO) ||
3194                     signbit(ua.h))) {
3195            goto soft;
3196        }
3197    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3198                        float32_is_neg(ua.s))) {
3199        goto soft;
3200    }
3201    ur.h = sqrtf(ua.h);
3202    return ur.s;
3203
3204 soft:
3205    return soft_f32_sqrt(ua.s, s);
3206}
3207
3208float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3209{
3210    union_float64 ua, ur;
3211
3212    ua.s = xa;
3213    if (unlikely(!can_use_fpu(s))) {
3214        goto soft;
3215    }
3216
3217    float64_input_flush1(&ua.s, s);
3218    if (QEMU_HARDFLOAT_1F64_USE_FP) {
3219        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3220                       fpclassify(ua.h) == FP_ZERO) ||
3221                     signbit(ua.h))) {
3222            goto soft;
3223        }
3224    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3225                        float64_is_neg(ua.s))) {
3226        goto soft;
3227    }
3228    ur.h = sqrt(ua.h);
3229    return ur.s;
3230
3231 soft:
3232    return soft_f64_sqrt(ua.s, s);
3233}
3234
3235/*----------------------------------------------------------------------------
3236| The pattern for a default generated NaN.
3237*----------------------------------------------------------------------------*/
3238
3239float16 float16_default_nan(float_status *status)
3240{
3241    FloatParts p = parts_default_nan(status);
3242    p.frac >>= float16_params.frac_shift;
3243    return float16_pack_raw(p);
3244}
3245
3246float32 float32_default_nan(float_status *status)
3247{
3248    FloatParts p = parts_default_nan(status);
3249    p.frac >>= float32_params.frac_shift;
3250    return float32_pack_raw(p);
3251}
3252
3253float64 float64_default_nan(float_status *status)
3254{
3255    FloatParts p = parts_default_nan(status);
3256    p.frac >>= float64_params.frac_shift;
3257    return float64_pack_raw(p);
3258}
3259
3260float128 float128_default_nan(float_status *status)
3261{
3262    FloatParts p = parts_default_nan(status);
3263    float128 r;
3264
3265    /* Extrapolate from the choices made by parts_default_nan to fill
3266     * in the quad-floating format.  If the low bit is set, assume we
3267     * want to set all non-snan bits.
3268     */
3269    r.low = -(p.frac & 1);
3270    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3271    r.high |= UINT64_C(0x7FFF000000000000);
3272    r.high |= (uint64_t)p.sign << 63;
3273
3274    return r;
3275}
3276
3277/*----------------------------------------------------------------------------
3278| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3279*----------------------------------------------------------------------------*/
3280
3281float16 float16_silence_nan(float16 a, float_status *status)
3282{
3283    FloatParts p = float16_unpack_raw(a);
3284    p.frac <<= float16_params.frac_shift;
3285    p = parts_silence_nan(p, status);
3286    p.frac >>= float16_params.frac_shift;
3287    return float16_pack_raw(p);
3288}
3289
3290float32 float32_silence_nan(float32 a, float_status *status)
3291{
3292    FloatParts p = float32_unpack_raw(a);
3293    p.frac <<= float32_params.frac_shift;
3294    p = parts_silence_nan(p, status);
3295    p.frac >>= float32_params.frac_shift;
3296    return float32_pack_raw(p);
3297}
3298
3299float64 float64_silence_nan(float64 a, float_status *status)
3300{
3301    FloatParts p = float64_unpack_raw(a);
3302    p.frac <<= float64_params.frac_shift;
3303    p = parts_silence_nan(p, status);
3304    p.frac >>= float64_params.frac_shift;
3305    return float64_pack_raw(p);
3306}
3307
3308
3309/*----------------------------------------------------------------------------
3310| If `a' is denormal and we are in flush-to-zero mode then set the
3311| input-denormal exception and return zero. Otherwise just return the value.
3312*----------------------------------------------------------------------------*/
3313
3314static bool parts_squash_denormal(FloatParts p, float_status *status)
3315{
3316    if (p.exp == 0 && p.frac != 0) {
3317        float_raise(float_flag_input_denormal, status);
3318        return true;
3319    }
3320
3321    return false;
3322}
3323
3324float16 float16_squash_input_denormal(float16 a, float_status *status)
3325{
3326    if (status->flush_inputs_to_zero) {
3327        FloatParts p = float16_unpack_raw(a);
3328        if (parts_squash_denormal(p, status)) {
3329            return float16_set_sign(float16_zero, p.sign);
3330        }
3331    }
3332    return a;
3333}
3334
3335float32 float32_squash_input_denormal(float32 a, float_status *status)
3336{
3337    if (status->flush_inputs_to_zero) {
3338        FloatParts p = float32_unpack_raw(a);
3339        if (parts_squash_denormal(p, status)) {
3340            return float32_set_sign(float32_zero, p.sign);
3341        }
3342    }
3343    return a;
3344}
3345
3346float64 float64_squash_input_denormal(float64 a, float_status *status)
3347{
3348    if (status->flush_inputs_to_zero) {
3349        FloatParts p = float64_unpack_raw(a);
3350        if (parts_squash_denormal(p, status)) {
3351            return float64_set_sign(float64_zero, p.sign);
3352        }
3353    }
3354    return a;
3355}
3356
3357/*----------------------------------------------------------------------------
3358| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3359| and 7, and returns the properly rounded 32-bit integer corresponding to the
3360| input.  If `zSign' is 1, the input is negated before being converted to an
3361| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3362| is simply rounded to an integer, with the inexact exception raised if the
3363| input cannot be represented exactly as an integer.  However, if the fixed-
3364| point input is too large, the invalid exception is raised and the largest
3365| positive or negative integer is returned.
3366*----------------------------------------------------------------------------*/
3367
3368static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3369{
3370    int8_t roundingMode;
3371    flag roundNearestEven;
3372    int8_t roundIncrement, roundBits;
3373    int32_t z;
3374
3375    roundingMode = status->float_rounding_mode;
3376    roundNearestEven = ( roundingMode == float_round_nearest_even );
3377    switch (roundingMode) {
3378    case float_round_nearest_even:
3379    case float_round_ties_away:
3380        roundIncrement = 0x40;
3381        break;
3382    case float_round_to_zero:
3383        roundIncrement = 0;
3384        break;
3385    case float_round_up:
3386        roundIncrement = zSign ? 0 : 0x7f;
3387        break;
3388    case float_round_down:
3389        roundIncrement = zSign ? 0x7f : 0;
3390        break;
3391    case float_round_to_odd:
3392        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3393        break;
3394    default:
3395        abort();
3396    }
3397    roundBits = absZ & 0x7F;
3398    absZ = ( absZ + roundIncrement )>>7;
3399    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3400    z = absZ;
3401    if ( zSign ) z = - z;
3402    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3403        float_raise(float_flag_invalid, status);
3404        return zSign ? INT32_MIN : INT32_MAX;
3405    }
3406    if (roundBits) {
3407        status->float_exception_flags |= float_flag_inexact;
3408    }
3409    return z;
3410
3411}
3412
3413/*----------------------------------------------------------------------------
3414| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3415| `absZ1', with binary point between bits 63 and 64 (between the input words),
3416| and returns the properly rounded 64-bit integer corresponding to the input.
3417| If `zSign' is 1, the input is negated before being converted to an integer.
3418| Ordinarily, the fixed-point input is simply rounded to an integer, with
3419| the inexact exception raised if the input cannot be represented exactly as
3420| an integer.  However, if the fixed-point input is too large, the invalid
3421| exception is raised and the largest positive or negative integer is
3422| returned.
3423*----------------------------------------------------------------------------*/
3424
3425static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3426                               float_status *status)
3427{
3428    int8_t roundingMode;
3429    flag roundNearestEven, increment;
3430    int64_t z;
3431
3432    roundingMode = status->float_rounding_mode;
3433    roundNearestEven = ( roundingMode == float_round_nearest_even );
3434    switch (roundingMode) {
3435    case float_round_nearest_even:
3436    case float_round_ties_away:
3437        increment = ((int64_t) absZ1 < 0);
3438        break;
3439    case float_round_to_zero:
3440        increment = 0;
3441        break;
3442    case float_round_up:
3443        increment = !zSign && absZ1;
3444        break;
3445    case float_round_down:
3446        increment = zSign && absZ1;
3447        break;
3448    case float_round_to_odd:
3449        increment = !(absZ0 & 1) && absZ1;
3450        break;
3451    default:
3452        abort();
3453    }
3454    if ( increment ) {
3455        ++absZ0;
3456        if ( absZ0 == 0 ) goto overflow;
3457        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3458    }
3459    z = absZ0;
3460    if ( zSign ) z = - z;
3461    if ( z && ( ( z < 0 ) ^ zSign ) ) {
3462 overflow:
3463        float_raise(float_flag_invalid, status);
3464        return zSign ? INT64_MIN : INT64_MAX;
3465    }
3466    if (absZ1) {
3467        status->float_exception_flags |= float_flag_inexact;
3468    }
3469    return z;
3470
3471}
3472
3473/*----------------------------------------------------------------------------
3474| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3475| `absZ1', with binary point between bits 63 and 64 (between the input words),
3476| and returns the properly rounded 64-bit unsigned integer corresponding to the
3477| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3478| with the inexact exception raised if the input cannot be represented exactly
3479| as an integer.  However, if the fixed-point input is too large, the invalid
3480| exception is raised and the largest unsigned integer is returned.
3481*----------------------------------------------------------------------------*/
3482
3483static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3484                                uint64_t absZ1, float_status *status)
3485{
3486    int8_t roundingMode;
3487    flag roundNearestEven, increment;
3488
3489    roundingMode = status->float_rounding_mode;
3490    roundNearestEven = (roundingMode == float_round_nearest_even);
3491    switch (roundingMode) {
3492    case float_round_nearest_even:
3493    case float_round_ties_away:
3494        increment = ((int64_t)absZ1 < 0);
3495        break;
3496    case float_round_to_zero:
3497        increment = 0;
3498        break;
3499    case float_round_up:
3500        increment = !zSign && absZ1;
3501        break;
3502    case float_round_down:
3503        increment = zSign && absZ1;
3504        break;
3505    case float_round_to_odd:
3506        increment = !(absZ0 & 1) && absZ1;
3507        break;
3508    default:
3509        abort();
3510    }
3511    if (increment) {
3512        ++absZ0;
3513        if (absZ0 == 0) {
3514            float_raise(float_flag_invalid, status);
3515            return UINT64_MAX;
3516        }
3517        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3518    }
3519
3520    if (zSign && absZ0) {
3521        float_raise(float_flag_invalid, status);
3522        return 0;
3523    }
3524
3525    if (absZ1) {
3526        status->float_exception_flags |= float_flag_inexact;
3527    }
3528    return absZ0;
3529}
3530
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig'.  The
| significand is shifted left by `clz32(aSig) - 8' bits and stored through
| `zSigPtr'; the matching exponent, one minus that shift count, is stored
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    const int8_t shift = clz32(aSig) - 8;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
3548
3549/*----------------------------------------------------------------------------
3550| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3551| and significand `zSig', and returns the proper single-precision floating-
3552| point value corresponding to the abstract input.  Ordinarily, the abstract
3553| value is simply rounded and packed into the single-precision format, with
3554| the inexact exception raised if the abstract input cannot be represented
3555| exactly.  However, if the abstract value is too large, the overflow and
3556| inexact exceptions are raised and an infinity or maximal finite value is
3557| returned.  If the abstract value is too small, the input value is rounded to
3558| a subnormal number, and the underflow and inexact exceptions are raised if
3559| the abstract input cannot be represented exactly as a subnormal single-
3560| precision floating-point number.
3561|     The input significand `zSig' has its binary point between bits 30
3562| and 29, which is 7 bits to the left of the usual location.  This shifted
3563| significand must be normalized or smaller.  If `zSig' is not normalized,
3564| `zExp' must be 0; in that case, the result returned is a subnormal number,
3565| and it must not require rounding.  In the usual case that `zSig' is
3566| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3567| The handling of underflow and overflow follows the IEC/IEEE Standard for
3568| Binary Floating-Point Arithmetic.
3569*----------------------------------------------------------------------------*/
3570
3571static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3572                                   float_status *status)
3573{
3574    int8_t roundingMode;
3575    flag roundNearestEven;
3576    int8_t roundIncrement, roundBits;
3577    flag isTiny;
3578
3579    roundingMode = status->float_rounding_mode;
3580    roundNearestEven = ( roundingMode == float_round_nearest_even );
3581    switch (roundingMode) {
3582    case float_round_nearest_even:
3583    case float_round_ties_away:
3584        roundIncrement = 0x40;
3585        break;
3586    case float_round_to_zero:
3587        roundIncrement = 0;
3588        break;
3589    case float_round_up:
3590        roundIncrement = zSign ? 0 : 0x7f;
3591        break;
3592    case float_round_down:
3593        roundIncrement = zSign ? 0x7f : 0;
3594        break;
3595    case float_round_to_odd:
3596        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3597        break;
3598    default:
3599        abort();
3600        break;
3601    }
3602    roundBits = zSig & 0x7F;
3603    if ( 0xFD <= (uint16_t) zExp ) {
3604        if (    ( 0xFD < zExp )
3605             || (    ( zExp == 0xFD )
3606                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3607           ) {
3608            bool overflow_to_inf = roundingMode != float_round_to_odd &&
3609                                   roundIncrement != 0;
3610            float_raise(float_flag_overflow | float_flag_inexact, status);
3611            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3612        }
3613        if ( zExp < 0 ) {
3614            if (status->flush_to_zero) {
3615                float_raise(float_flag_output_denormal, status);
3616                return packFloat32(zSign, 0, 0);
3617            }
3618            isTiny =
3619                (status->float_detect_tininess
3620                 == float_tininess_before_rounding)
3621                || ( zExp < -1 )
3622                || ( zSig + roundIncrement < 0x80000000 );
3623            shift32RightJamming( zSig, - zExp, &zSig );
3624            zExp = 0;
3625            roundBits = zSig & 0x7F;
3626            if (isTiny && roundBits) {
3627                float_raise(float_flag_underflow, status);
3628            }
3629            if (roundingMode == float_round_to_odd) {
3630                /*
3631                 * For round-to-odd case, the roundIncrement depends on
3632                 * zSig which just changed.
3633                 */
3634                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3635            }
3636        }
3637    }
3638    if (roundBits) {
3639        status->float_exception_flags |= float_flag_inexact;
3640    }
3641    zSig = ( zSig + roundIncrement )>>7;
3642    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3643    if ( zSig == 0 ) zExp = 0;
3644    return packFloat32( zSign, zExp, zSig );
3645
3646}
3647
3648/*----------------------------------------------------------------------------
3649| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3650| and significand `zSig', and returns the proper single-precision floating-
3651| point value corresponding to the abstract input.  This routine is just like
3652| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3653| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3654| floating-point exponent.
3655*----------------------------------------------------------------------------*/
3656
3657static float32
3658 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3659                              float_status *status)
3660{
3661    int8_t shiftCount;
3662
3663    shiftCount = clz32(zSig) - 1;
3664    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3665                               status);
3666
3667}
3668
3669/*----------------------------------------------------------------------------
3670| Normalizes the subnormal double-precision floating-point value represented
3671| by the denormalized significand `aSig'.  The normalized exponent and
3672| significand are stored at the locations pointed to by `zExpPtr' and
3673| `zSigPtr', respectively.
3674*----------------------------------------------------------------------------*/
3675
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift distance derived from the leading-zero count of aSig. */
    const int8_t lshift = clz64(aSig) - 11;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3686
3687/*----------------------------------------------------------------------------
3688| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3689| double-precision floating-point value, returning the result.  After being
3690| shifted into the proper positions, the three fields are simply added
3691| together to form the result.  This means that any integer portion of `zSig'
3692| will be added into the exponent.  Since a properly normalized significand
3693| will have an integer portion equal to 1, the `zExp' input should be 1 less
3694| than the desired result exponent whenever `zSig' is a complete, normalized
3695| significand.
3696*----------------------------------------------------------------------------*/
3697
3698static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3699{
3700
3701    return make_float64(
3702        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3703
3704}
3705
3706/*----------------------------------------------------------------------------
3707| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3708| and significand `zSig', and returns the proper double-precision floating-
3709| point value corresponding to the abstract input.  Ordinarily, the abstract
3710| value is simply rounded and packed into the double-precision format, with
3711| the inexact exception raised if the abstract input cannot be represented
3712| exactly.  However, if the abstract value is too large, the overflow and
3713| inexact exceptions are raised and an infinity or maximal finite value is
3714| returned.  If the abstract value is too small, the input value is rounded to
3715| a subnormal number, and the underflow and inexact exceptions are raised if
3716| the abstract input cannot be represented exactly as a subnormal double-
3717| precision floating-point number.
3718|     The input significand `zSig' has its binary point between bits 62
3719| and 61, which is 10 bits to the left of the usual location.  This shifted
3720| significand must be normalized or smaller.  If `zSig' is not normalized,
3721| `zExp' must be 0; in that case, the result returned is a subnormal number,
3722| and it must not require rounding.  In the usual case that `zSig' is
3723| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3724| The handling of underflow and overflow follows the IEC/IEEE Standard for
3725| Binary Floating-Point Arithmetic.
3726*----------------------------------------------------------------------------*/
3727
3728static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3729                                   float_status *status)
3730{
3731    int8_t roundingMode;
3732    flag roundNearestEven;
3733    int roundIncrement, roundBits;
3734    flag isTiny;
3735
3736    roundingMode = status->float_rounding_mode;
3737    roundNearestEven = ( roundingMode == float_round_nearest_even );
3738    switch (roundingMode) {
3739    case float_round_nearest_even:
3740    case float_round_ties_away:
3741        roundIncrement = 0x200;
3742        break;
3743    case float_round_to_zero:
3744        roundIncrement = 0;
3745        break;
3746    case float_round_up:
3747        roundIncrement = zSign ? 0 : 0x3ff;
3748        break;
3749    case float_round_down:
3750        roundIncrement = zSign ? 0x3ff : 0;
3751        break;
3752    case float_round_to_odd:
3753        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3754        break;
3755    default:
3756        abort();
3757    }
3758    roundBits = zSig & 0x3FF;
3759    if ( 0x7FD <= (uint16_t) zExp ) {
3760        if (    ( 0x7FD < zExp )
3761             || (    ( zExp == 0x7FD )
3762                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3763           ) {
3764            bool overflow_to_inf = roundingMode != float_round_to_odd &&
3765                                   roundIncrement != 0;
3766            float_raise(float_flag_overflow | float_flag_inexact, status);
3767            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3768        }
3769        if ( zExp < 0 ) {
3770            if (status->flush_to_zero) {
3771                float_raise(float_flag_output_denormal, status);
3772                return packFloat64(zSign, 0, 0);
3773            }
3774            isTiny =
3775                   (status->float_detect_tininess
3776                    == float_tininess_before_rounding)
3777                || ( zExp < -1 )
3778                || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
3779            shift64RightJamming( zSig, - zExp, &zSig );
3780            zExp = 0;
3781            roundBits = zSig & 0x3FF;
3782            if (isTiny && roundBits) {
3783                float_raise(float_flag_underflow, status);
3784            }
3785            if (roundingMode == float_round_to_odd) {
3786                /*
3787                 * For round-to-odd case, the roundIncrement depends on
3788                 * zSig which just changed.
3789                 */
3790                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3791            }
3792        }
3793    }
3794    if (roundBits) {
3795        status->float_exception_flags |= float_flag_inexact;
3796    }
3797    zSig = ( zSig + roundIncrement )>>10;
3798    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3799    if ( zSig == 0 ) zExp = 0;
3800    return packFloat64( zSign, zExp, zSig );
3801
3802}
3803
3804/*----------------------------------------------------------------------------
3805| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3806| and significand `zSig', and returns the proper double-precision floating-
3807| point value corresponding to the abstract input.  This routine is just like
3808| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3809| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3810| floating-point exponent.
3811*----------------------------------------------------------------------------*/
3812
3813static float64
3814 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3815                              float_status *status)
3816{
3817    int8_t shiftCount;
3818
3819    shiftCount = clz64(zSig) - 1;
3820    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3821                               status);
3822
3823}
3824
3825/*----------------------------------------------------------------------------
3826| Normalizes the subnormal extended double-precision floating-point value
3827| represented by the denormalized significand `aSig'.  The normalized exponent
3828| and significand are stored at the locations pointed to by `zExpPtr' and
3829| `zSigPtr', respectively.
3830*----------------------------------------------------------------------------*/
3831
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Shift by the full leading-zero count of aSig. */
    const int8_t lshift = clz64(aSig);

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3841
3842/*----------------------------------------------------------------------------
3843| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3844| and extended significand formed by the concatenation of `zSig0' and `zSig1',
3845| and returns the proper extended double-precision floating-point value
3846| corresponding to the abstract input.  Ordinarily, the abstract value is
3847| rounded and packed into the extended double-precision format, with the
3848| inexact exception raised if the abstract input cannot be represented
3849| exactly.  However, if the abstract value is too large, the overflow and
3850| inexact exceptions are raised and an infinity or maximal finite value is
3851| returned.  If the abstract value is too small, the input value is rounded to
3852| a subnormal number, and the underflow and inexact exceptions are raised if
3853| the abstract input cannot be represented exactly as a subnormal extended
3854| double-precision floating-point number.
3855|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3856| number of bits as single or double precision, respectively.  Otherwise, the
3857| result is rounded to the full precision of the extended double-precision
3858| format.
3859|     The input significand must be normalized or smaller.  If the input
3860| significand is not normalized, `zExp' must be 0; in that case, the result
3861| returned is a subnormal number, and it must not require rounding.  The
3862| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3863| Floating-Point Arithmetic.
3864*----------------------------------------------------------------------------*/
3865
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    /*
     * Rounds zSig0:zSig1 according to status->float_rounding_mode and packs
     * the result with packFloatx80.  Two code paths exist: a reduced
     * precision path (roundingPrecision 64 or 32) that rounds inside zSig0,
     * and the `precision80' path that uses zSig1 as the discarded bits.
     */
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Any value other than 64 or 32 is rounded at full precision. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Discard the low 11 bits of zSig0 (53 kept); 0x400 is half of that. */
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        /* Discard the low 40 bits of zSig0 (24 kept); increment is half. */
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /*
     * Fold any nonzero zSig1 into bit 0 of zSig0 as a sticky bit; zSig1 is
     * not read again on the reduced-precision path.
     */
    zSig0 |= ( zSig1 != 0 );
    /*
     * The nearest modes keep the half-unit increment chosen above; directed
     * modes use either 0 or the whole mask.  Any other rounding mode
     * (float_round_to_odd has no case here) reaches abort().
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * One unsigned compare catches both ends of the range: zExp <= 0 wraps
     * to a huge value, and zExp >= 0x7FFE is at or above 0x7FFD after the
     * subtraction.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* The second test detects the rounding add wrapping past 64 bits. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /*
             * Jumps into the overflow handling inside the precision80 code
             * below, with roundMask still holding the reduced-precision mask.
             */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny unless detection is after rounding, zExp is exactly 0 and
             * the rounding add wraps around (last term false).
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits from the shifted significand. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /*
             * If the add set bit 63 the exponent field becomes 1
             * (presumably the explicit integer bit, i.e. the value rounded
             * up to the smallest normal number -- confirm against format).
             */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundIncrement is reused as the lowest kept bit.  On an exact
             * tie under nearest-even that bit is added to the mask so it is
             * cleared as well.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* Wraparound of the add: carry out of bit 63, so renormalize. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    /* Same tie handling as in the subnormal branch above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: zSig1 holds the discarded bits.  Nearest modes round
     * up when its top bit is set; directed modes round up on any nonzero
     * zSig1 of the matching sign.  float_round_to_odd has no case and
     * aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same combined range check as on the reduced-precision path. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* With a zero mask, ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /* Also entered by `goto overflow' from the reduced path. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign stay finite. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path, this branch
             * does not test status->flush_to_zero -- confirm intentional.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so the decision is recomputed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /*
                 * Exact tie (only the top bit of zSig1 set) under
                 * nearest-even: clear bit 0 of the result.
                 */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Bit 63 set by the increment: exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        /* Wrapped to zero: carry out of bit 63, so renormalize. */
        if ( zSig0 == 0 ) {
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Exact tie under nearest-even: clear bit 0 of the result. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4054
4055/*----------------------------------------------------------------------------
4056| Takes an abstract floating-point value having sign `zSign', exponent
4057| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4058| and returns the proper extended double-precision floating-point value
4059| corresponding to the abstract input.  This routine is just like
4060| `roundAndPackFloatx80' except that the input significand does not have to be
4061| normalized.
4062*----------------------------------------------------------------------------*/
4063
4064floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4065                                       flag zSign, int32_t zExp,
4066                                       uint64_t zSig0, uint64_t zSig1,
4067                                       float_status *status)
4068{
4069    int8_t shiftCount;
4070
4071    if ( zSig0 == 0 ) {
4072        zSig0 = zSig1;
4073        zSig1 = 0;
4074        zExp -= 64;
4075    }
4076    shiftCount = clz64(zSig0);
4077    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4078    zExp -= shiftCount;
4079    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4080                                zSig0, zSig1, status);
4081
4082}
4083
4084/*----------------------------------------------------------------------------
4085| Returns the least-significant 64 fraction bits of the quadruple-precision
4086| floating-point value `a'.
4087*----------------------------------------------------------------------------*/
4088
4089static inline uint64_t extractFloat128Frac1( float128 a )
4090{
4091
4092    return a.low;
4093
4094}
4095
4096/*----------------------------------------------------------------------------
4097| Returns the most-significant 48 fraction bits of the quadruple-precision
4098| floating-point value `a'.
4099*----------------------------------------------------------------------------*/
4100
4101static inline uint64_t extractFloat128Frac0( float128 a )
4102{
4103
4104    return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4105
4106}
4107
4108/*----------------------------------------------------------------------------
4109| Returns the exponent bits of the quadruple-precision floating-point value
4110| `a'.
4111*----------------------------------------------------------------------------*/
4112
4113static inline int32_t extractFloat128Exp( float128 a )
4114{
4115
4116    return ( a.high>>48 ) & 0x7FFF;
4117
4118}
4119
4120/*----------------------------------------------------------------------------
4121| Returns the sign bit of the quadruple-precision floating-point value `a'.
4122*----------------------------------------------------------------------------*/
4123
4124static inline flag extractFloat128Sign( float128 a )
4125{
4126
4127    return a.high>>63;
4128
4129}
4130
4131/*----------------------------------------------------------------------------
4132| Normalizes the subnormal quadruple-precision floating-point value
4133| represented by the denormalized significand formed by the concatenation of
4134| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4135| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4136| significand are stored at the location pointed to by `zSig0Ptr', and the
4137| least significant 64 bits of the normalized significand are stored at the
4138| location pointed to by `zSig1Ptr'.
4139*----------------------------------------------------------------------------*/
4140
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* The high word is nonzero: a plain 128-bit left shift suffices. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Only the low word has bits; place it relative to the high word. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Leading bit is too high: split aSig1 across both output words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4171
4172/*----------------------------------------------------------------------------
4173| Packs the sign `zSign', the exponent `zExp', and the significand formed
4174| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4175| floating-point value, returning the result.  After being shifted into the
4176| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result.  This
4178| means that any integer portion of `zSig0' will be added into the exponent.
4179| Since a properly normalized significand will have an integer portion equal
4180| to 1, the `zExp' input should be 1 less than the desired result exponent
4181| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4182| significand.
4183*----------------------------------------------------------------------------*/
4184
4185static inline float128
4186 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4187{
4188    float128 z;
4189
4190    z.low = zSig1;
4191    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4192    return z;
4193
4194}
4195
4196/*----------------------------------------------------------------------------
4197| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4198| and extended significand formed by the concatenation of `zSig0', `zSig1',
4199| and `zSig2', and returns the proper quadruple-precision floating-point value
4200| corresponding to the abstract input.  Ordinarily, the abstract value is
4201| simply rounded and packed into the quadruple-precision format, with the
4202| inexact exception raised if the abstract input cannot be represented
4203| exactly.  However, if the abstract value is too large, the overflow and
4204| inexact exceptions are raised and an infinity or maximal finite value is
4205| returned.  If the abstract value is too small, the input value is rounded to
4206| a subnormal number, and the underflow and inexact exceptions are raised if
4207| the abstract input cannot be represented exactly as a subnormal quadruple-
4208| precision floating-point number.
4209|     The input significand must be normalized or smaller.  If the input
4210| significand is not normalized, `zExp' must be 0; in that case, the result
4211| returned is a subnormal number, and it must not require rounding.  In the
4212| usual case that the input significand is normalized, `zExp' must be 1 less
4213| than the ``true'' floating-point exponent.  The handling of underflow and
4214| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4215*----------------------------------------------------------------------------*/
4216
4217static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4218                                     uint64_t zSig0, uint64_t zSig1,
4219                                     uint64_t zSig2, float_status *status)
4220{
4221    int8_t roundingMode;
4222    flag roundNearestEven, increment, isTiny;
4223
4224    roundingMode = status->float_rounding_mode;
4225    roundNearestEven = ( roundingMode == float_round_nearest_even );
4226    switch (roundingMode) {
4227    case float_round_nearest_even:
4228    case float_round_ties_away:
4229        increment = ((int64_t)zSig2 < 0);
4230        break;
4231    case float_round_to_zero:
4232        increment = 0;
4233        break;
4234    case float_round_up:
4235        increment = !zSign && zSig2;
4236        break;
4237    case float_round_down:
4238        increment = zSign && zSig2;
4239        break;
4240    case float_round_to_odd:
4241        increment = !(zSig1 & 0x1) && zSig2;
4242        break;
4243    default:
4244        abort();
4245    }
4246    if ( 0x7FFD <= (uint32_t) zExp ) {
4247        if (    ( 0x7FFD < zExp )
4248             || (    ( zExp == 0x7FFD )
4249                  && eq128(
4250                         UINT64_C(0x0001FFFFFFFFFFFF),
4251                         UINT64_C(0xFFFFFFFFFFFFFFFF),
4252                         zSig0,
4253                         zSig1
4254                     )
4255                  && increment
4256                )
4257           ) {
4258            float_raise(float_flag_overflow | float_flag_inexact, status);
4259            if (    ( roundingMode == float_round_to_zero )
4260                 || ( zSign && ( roundingMode == float_round_up ) )
4261                 || ( ! zSign && ( roundingMode == float_round_down ) )
4262                 || (roundingMode == float_round_to_odd)
4263               ) {
4264                return
4265                    packFloat128(
4266                        zSign,
4267                        0x7FFE,
4268                        UINT64_C(0x0000FFFFFFFFFFFF),
4269                        UINT64_C(0xFFFFFFFFFFFFFFFF)
4270                    );
4271            }
4272            return packFloat128( zSign, 0x7FFF, 0, 0 );
4273        }
4274        if ( zExp < 0 ) {
4275            if (status->flush_to_zero) {
4276                float_raise(float_flag_output_denormal, status);
4277                return packFloat128(zSign, 0, 0, 0);
4278            }
4279            isTiny =
4280                   (status->float_detect_tininess
4281                    == float_tininess_before_rounding)
4282                || ( zExp < -1 )
4283                || ! increment
4284                || lt128(
4285                       zSig0,
4286                       zSig1,
4287                       UINT64_C(0x0001FFFFFFFFFFFF),
4288                       UINT64_C(0xFFFFFFFFFFFFFFFF)
4289                   );
4290            shift128ExtraRightJamming(
4291                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4292            zExp = 0;
4293            if (isTiny && zSig2) {
4294                float_raise(float_flag_underflow, status);
4295            }
4296            switch (roundingMode) {
4297            case float_round_nearest_even:
4298            case float_round_ties_away:
4299                increment = ((int64_t)zSig2 < 0);
4300                break;
4301            case float_round_to_zero:
4302                increment = 0;
4303                break;
4304            case float_round_up:
4305                increment = !zSign && zSig2;
4306                break;
4307            case float_round_down:
4308                increment = zSign && zSig2;
4309                break;
4310            case float_round_to_odd:
4311                increment = !(zSig1 & 0x1) && zSig2;
4312                break;
4313            default:
4314                abort();
4315            }
4316        }
4317    }
4318    if (zSig2) {
4319        status->float_exception_flags |= float_flag_inexact;
4320    }
4321    if ( increment ) {
4322        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4323        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4324    }
4325    else {
4326        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4327    }
4328    return packFloat128( zSign, zExp, zSig0, zSig1 );
4329
4330}
4331
4332/*----------------------------------------------------------------------------
4333| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4334| and significand formed by the concatenation of `zSig0' and `zSig1', and
4335| returns the proper quadruple-precision floating-point value corresponding
4336| to the abstract input.  This routine is just like `roundAndPackFloat128'
4337| except that the input significand has fewer bits and does not have to be
4338| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4339| point exponent.
4340*----------------------------------------------------------------------------*/
4341
4342static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4343                                              uint64_t zSig0, uint64_t zSig1,
4344                                              float_status *status)
4345{
4346    int8_t shiftCount;
4347    uint64_t zSig2;
4348
4349    if ( zSig0 == 0 ) {
4350        zSig0 = zSig1;
4351        zSig1 = 0;
4352        zExp -= 64;
4353    }
4354    shiftCount = clz64(zSig0) - 15;
4355    if ( 0 <= shiftCount ) {
4356        zSig2 = 0;
4357        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4358    }
4359    else {
4360        shift128ExtraRightJamming(
4361            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4362    }
4363    zExp -= shiftCount;
4364    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4365
4366}
4367
4368
4369/*----------------------------------------------------------------------------
4370| Returns the result of converting the 32-bit two's complement integer `a'
4371| to the extended double-precision floating-point format.  The conversion
4372| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4373| Arithmetic.
4374*----------------------------------------------------------------------------*/
4375
4376floatx80 int32_to_floatx80(int32_t a, float_status *status)
4377{
4378    flag zSign;
4379    uint32_t absA;
4380    int8_t shiftCount;
4381    uint64_t zSig;
4382
4383    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4384    zSign = ( a < 0 );
4385    absA = zSign ? - a : a;
4386    shiftCount = clz32(absA) + 32;
4387    zSig = absA;
4388    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4389
4390}
4391
4392/*----------------------------------------------------------------------------
4393| Returns the result of converting the 32-bit two's complement integer `a' to
4394| the quadruple-precision floating-point format.  The conversion is performed
4395| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396*----------------------------------------------------------------------------*/
4397
4398float128 int32_to_float128(int32_t a, float_status *status)
4399{
4400    flag zSign;
4401    uint32_t absA;
4402    int8_t shiftCount;
4403    uint64_t zSig0;
4404
4405    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4406    zSign = ( a < 0 );
4407    absA = zSign ? - a : a;
4408    shiftCount = clz32(absA) + 17;
4409    zSig0 = absA;
4410    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4411
4412}
4413
4414/*----------------------------------------------------------------------------
4415| Returns the result of converting the 64-bit two's complement integer `a'
4416| to the extended double-precision floating-point format.  The conversion
4417| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4418| Arithmetic.
4419*----------------------------------------------------------------------------*/
4420
4421floatx80 int64_to_floatx80(int64_t a, float_status *status)
4422{
4423    flag zSign;
4424    uint64_t absA;
4425    int8_t shiftCount;
4426
4427    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4428    zSign = ( a < 0 );
4429    absA = zSign ? - a : a;
4430    shiftCount = clz64(absA);
4431    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4432
4433}
4434
4435/*----------------------------------------------------------------------------
4436| Returns the result of converting the 64-bit two's complement integer `a' to
4437| the quadruple-precision floating-point format.  The conversion is performed
4438| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4439*----------------------------------------------------------------------------*/
4440
4441float128 int64_to_float128(int64_t a, float_status *status)
4442{
4443    flag zSign;
4444    uint64_t absA;
4445    int8_t shiftCount;
4446    int32_t zExp;
4447    uint64_t zSig0, zSig1;
4448
4449    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4450    zSign = ( a < 0 );
4451    absA = zSign ? - a : a;
4452    shiftCount = clz64(absA) + 49;
4453    zExp = 0x406E - shiftCount;
4454    if ( 64 <= shiftCount ) {
4455        zSig1 = 0;
4456        zSig0 = absA;
4457        shiftCount -= 64;
4458    }
4459    else {
4460        zSig1 = absA;
4461        zSig0 = 0;
4462    }
4463    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4464    return packFloat128( zSign, zExp, zSig0, zSig1 );
4465
4466}
4467
4468/*----------------------------------------------------------------------------
4469| Returns the result of converting the 64-bit unsigned integer `a'
4470| to the quadruple-precision floating-point format.  The conversion is performed
4471| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4472*----------------------------------------------------------------------------*/
4473
4474float128 uint64_to_float128(uint64_t a, float_status *status)
4475{
4476    if (a == 0) {
4477        return float128_zero;
4478    }
4479    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4480}
4481
4482/*----------------------------------------------------------------------------
4483| Returns the result of converting the single-precision floating-point value
4484| `a' to the extended double-precision floating-point format.  The conversion
4485| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4486| Arithmetic.
4487*----------------------------------------------------------------------------*/
4488
4489floatx80 float32_to_floatx80(float32 a, float_status *status)
4490{
4491    flag aSign;
4492    int aExp;
4493    uint32_t aSig;
4494
4495    a = float32_squash_input_denormal(a, status);
4496    aSig = extractFloat32Frac( a );
4497    aExp = extractFloat32Exp( a );
4498    aSign = extractFloat32Sign( a );
4499    if ( aExp == 0xFF ) {
4500        if (aSig) {
4501            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4502        }
4503        return packFloatx80(aSign,
4504                            floatx80_infinity_high,
4505                            floatx80_infinity_low);
4506    }
4507    if ( aExp == 0 ) {
4508        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4509        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4510    }
4511    aSig |= 0x00800000;
4512    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4513
4514}
4515
4516/*----------------------------------------------------------------------------
4517| Returns the result of converting the single-precision floating-point value
4518| `a' to the double-precision floating-point format.  The conversion is
4519| performed according to the IEC/IEEE Standard for Binary Floating-Point
4520| Arithmetic.
4521*----------------------------------------------------------------------------*/
4522
4523float128 float32_to_float128(float32 a, float_status *status)
4524{
4525    flag aSign;
4526    int aExp;
4527    uint32_t aSig;
4528
4529    a = float32_squash_input_denormal(a, status);
4530    aSig = extractFloat32Frac( a );
4531    aExp = extractFloat32Exp( a );
4532    aSign = extractFloat32Sign( a );
4533    if ( aExp == 0xFF ) {
4534        if (aSig) {
4535            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4536        }
4537        return packFloat128( aSign, 0x7FFF, 0, 0 );
4538    }
4539    if ( aExp == 0 ) {
4540        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4541        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4542        --aExp;
4543    }
4544    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4545
4546}
4547
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled up front: a NaN operand is propagated; an infinite
| `a' or a zero `b' raises the invalid exception and yields the default NaN;
| an infinite `b' or a zero `a' returns `a' unchanged.  Of the two candidate
| remainders straddling zero, the one of smaller magnitude is returned, with
| ties resolved by the parity of the quotient.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinite `a' with a non-NaN `b': invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is a NaN (propagate) or an infinity (the remainder is `a'). */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder with respect to zero is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Restore the implicit leading one of both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the quotient fits in 32 bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* `a' is at least two binades below `b': it is the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce in 64-bit arithmetic, 62 bits a step. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Use the estimate less 2, clamped at zero. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Keep subtracting bSig until aSig turns negative (as a signed value);
     * alternateASig retains the last non-negative remainder and q counts
     * the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Pick whichever candidate is nearer to zero: a negative sum means the
     * negative candidate has the larger magnitude.  On an exact tie the
     * non-negative candidate is taken when q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
4649
4650
4651
4652/*----------------------------------------------------------------------------
4653| Returns the binary exponential of the single-precision floating-point value
4654| `a'. The operation is performed according to the IEC/IEEE Standard for
4655| Binary Floating-Point Arithmetic.
4656|
4657| Uses the following identities:
4658|
4659| 1. -------------------------------------------------------------------------
4660|      x    x*ln(2)
4661|     2  = e
4662|
4663| 2. -------------------------------------------------------------------------
4664|                      2     3     4     5           n
4665|      x        x     x     x     x     x           x
4666|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4667|               1!    2!    3!    4!    5!          n!
4668*----------------------------------------------------------------------------*/
4669
/*
 * Taylor-series coefficients 1/n! for n = 1..15, written as raw float64
 * bit patterns through const_float64.  float32_exp2 multiplies entry i by
 * the (i+1)-th power of its argument.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1/1!  */
    const_float64( 0x3fe0000000000000ll ), /*  1/2!  */
    const_float64( 0x3fc5555555555555ll ), /*  1/3!  */
    const_float64( 0x3fa5555555555555ll ), /*  1/4!  */
    const_float64( 0x3f81111111111111ll ), /*  1/5!  */
    const_float64( 0x3f56c16c16c16c17ll ), /*  1/6!  */
    const_float64( 0x3f2a01a01a01a01all ), /*  1/7!  */
    const_float64( 0x3efa01a01a01a01all ), /*  1/8!  */
    const_float64( 0x3ec71de3a556c734ll ), /*  1/9!  */
    const_float64( 0x3e927e4fb7789f5cll ), /*  1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /*  1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /*  1/12! */
    const_float64( 0x3de6124613a86d09ll ), /*  1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /*  1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /*  1/15! */
};
4688
4689float32 float32_exp2(float32 a, float_status *status)
4690{
4691    flag aSign;
4692    int aExp;
4693    uint32_t aSig;
4694    float64 r, x, xn;
4695    int i;
4696    a = float32_squash_input_denormal(a, status);
4697
4698    aSig = extractFloat32Frac( a );
4699    aExp = extractFloat32Exp( a );
4700    aSign = extractFloat32Sign( a );
4701
4702    if ( aExp == 0xFF) {
4703        if (aSig) {
4704            return propagateFloat32NaN(a, float32_zero, status);
4705        }
4706        return (aSign) ? float32_zero : a;
4707    }
4708    if (aExp == 0) {
4709        if (aSig == 0) return float32_one;
4710    }
4711
4712    float_raise(float_flag_inexact, status);
4713
4714    /* ******************************* */
4715    /* using float64 for approximation */
4716    /* ******************************* */
4717    x = float32_to_float64(a, status);
4718    x = float64_mul(x, float64_ln2, status);
4719
4720    xn = x;
4721    r = float64_one;
4722    for (i = 0 ; i < 15 ; i++) {
4723        float64 f;
4724
4725        f = float64_mul(xn, float32_exp2_coefficients[i], status);
4726        r = float64_add(r, f, status);
4727
4728        xn = float64_mul(xn, x, status);
4729    }
4730
4731    return float64_to_float32(r, status);
4732}
4733
4734/*----------------------------------------------------------------------------
4735| Returns the binary log of the single-precision floating-point value `a'.
4736| The operation is performed according to the IEC/IEEE Standard for Binary
4737| Floating-Point Arithmetic.
4738*----------------------------------------------------------------------------*/
4739float32 float32_log2(float32 a, float_status *status)
4740{
4741    flag aSign, zSign;
4742    int aExp;
4743    uint32_t aSig, zSig, i;
4744
4745    a = float32_squash_input_denormal(a, status);
4746    aSig = extractFloat32Frac( a );
4747    aExp = extractFloat32Exp( a );
4748    aSign = extractFloat32Sign( a );
4749
4750    if ( aExp == 0 ) {
4751        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4752        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4753    }
4754    if ( aSign ) {
4755        float_raise(float_flag_invalid, status);
4756        return float32_default_nan(status);
4757    }
4758    if ( aExp == 0xFF ) {
4759        if (aSig) {
4760            return propagateFloat32NaN(a, float32_zero, status);
4761        }
4762        return a;
4763    }
4764
4765    aExp -= 0x7F;
4766    aSig |= 0x00800000;
4767    zSign = aExp < 0;
4768    zSig = aExp << 23;
4769
4770    for (i = 1 << 22; i > 0; i >>= 1) {
4771        aSig = ( (uint64_t)aSig * aSig ) >> 23;
4772        if ( aSig & 0x01000000 ) {
4773            aSig >>= 1;
4774            zSig |= i;
4775        }
4776    }
4777
4778    if ( zSign )
4779        zSig = -zSig;
4780
4781    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4782}
4783
4784/*----------------------------------------------------------------------------
4785| Returns 1 if the single-precision floating-point value `a' is equal to
4786| the corresponding value `b', and 0 otherwise.  The invalid exception is
4787| raised if either operand is a NaN.  Otherwise, the comparison is performed
4788| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4789*----------------------------------------------------------------------------*/
4790
4791int float32_eq(float32 a, float32 b, float_status *status)
4792{
4793    uint32_t av, bv;
4794    a = float32_squash_input_denormal(a, status);
4795    b = float32_squash_input_denormal(b, status);
4796
4797    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4798         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4799       ) {
4800        float_raise(float_flag_invalid, status);
4801        return 0;
4802    }
4803    av = float32_val(a);
4804    bv = float32_val(b);
4805    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4806}
4807
4808/*----------------------------------------------------------------------------
4809| Returns 1 if the single-precision floating-point value `a' is less than
4810| or equal to the corresponding value `b', and 0 otherwise.  The invalid
4811| exception is raised if either operand is a NaN.  The comparison is performed
4812| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813*----------------------------------------------------------------------------*/
4814
4815int float32_le(float32 a, float32 b, float_status *status)
4816{
4817    flag aSign, bSign;
4818    uint32_t av, bv;
4819    a = float32_squash_input_denormal(a, status);
4820    b = float32_squash_input_denormal(b, status);
4821
4822    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4823         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4824       ) {
4825        float_raise(float_flag_invalid, status);
4826        return 0;
4827    }
4828    aSign = extractFloat32Sign( a );
4829    bSign = extractFloat32Sign( b );
4830    av = float32_val(a);
4831    bv = float32_val(b);
4832    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4833    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4834
4835}
4836
4837/*----------------------------------------------------------------------------
4838| Returns 1 if the single-precision floating-point value `a' is less than
4839| the corresponding value `b', and 0 otherwise.  The invalid exception is
4840| raised if either operand is a NaN.  The comparison is performed according
4841| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842*----------------------------------------------------------------------------*/
4843
4844int float32_lt(float32 a, float32 b, float_status *status)
4845{
4846    flag aSign, bSign;
4847    uint32_t av, bv;
4848    a = float32_squash_input_denormal(a, status);
4849    b = float32_squash_input_denormal(b, status);
4850
4851    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4852         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4853       ) {
4854        float_raise(float_flag_invalid, status);
4855        return 0;
4856    }
4857    aSign = extractFloat32Sign( a );
4858    bSign = extractFloat32Sign( b );
4859    av = float32_val(a);
4860    bv = float32_val(b);
4861    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4862    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4863
4864}
4865
4866/*----------------------------------------------------------------------------
4867| Returns 1 if the single-precision floating-point values `a' and `b' cannot
4868| be compared, and 0 otherwise.  The invalid exception is raised if either
4869| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4870| Standard for Binary Floating-Point Arithmetic.
4871*----------------------------------------------------------------------------*/
4872
4873int float32_unordered(float32 a, float32 b, float_status *status)
4874{
4875    a = float32_squash_input_denormal(a, status);
4876    b = float32_squash_input_denormal(b, status);
4877
4878    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4879         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4880       ) {
4881        float_raise(float_flag_invalid, status);
4882        return 1;
4883    }
4884    return 0;
4885}
4886
4887/*----------------------------------------------------------------------------
4888| Returns 1 if the single-precision floating-point value `a' is equal to
4889| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4890| exception.  The comparison is performed according to the IEC/IEEE Standard
4891| for Binary Floating-Point Arithmetic.
4892*----------------------------------------------------------------------------*/
4893
4894int float32_eq_quiet(float32 a, float32 b, float_status *status)
4895{
4896    a = float32_squash_input_denormal(a, status);
4897    b = float32_squash_input_denormal(b, status);
4898
4899    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4900         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4901       ) {
4902        if (float32_is_signaling_nan(a, status)
4903         || float32_is_signaling_nan(b, status)) {
4904            float_raise(float_flag_invalid, status);
4905        }
4906        return 0;
4907    }
4908    return ( float32_val(a) == float32_val(b) ) ||
4909            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4910}
4911
4912/*----------------------------------------------------------------------------
4913| Returns 1 if the single-precision floating-point value `a' is less than or
4914| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4915| cause an exception.  Otherwise, the comparison is performed according to the
4916| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4917*----------------------------------------------------------------------------*/
4918
4919int float32_le_quiet(float32 a, float32 b, float_status *status)
4920{
4921    flag aSign, bSign;
4922    uint32_t av, bv;
4923    a = float32_squash_input_denormal(a, status);
4924    b = float32_squash_input_denormal(b, status);
4925
4926    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4927         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4928       ) {
4929        if (float32_is_signaling_nan(a, status)
4930         || float32_is_signaling_nan(b, status)) {
4931            float_raise(float_flag_invalid, status);
4932        }
4933        return 0;
4934    }
4935    aSign = extractFloat32Sign( a );
4936    bSign = extractFloat32Sign( b );
4937    av = float32_val(a);
4938    bv = float32_val(b);
4939    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4940    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4941
4942}
4943
4944/*----------------------------------------------------------------------------
4945| Returns 1 if the single-precision floating-point value `a' is less than
4946| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4947| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4948| Standard for Binary Floating-Point Arithmetic.
4949*----------------------------------------------------------------------------*/
4950
4951int float32_lt_quiet(float32 a, float32 b, float_status *status)
4952{
4953    flag aSign, bSign;
4954    uint32_t av, bv;
4955    a = float32_squash_input_denormal(a, status);
4956    b = float32_squash_input_denormal(b, status);
4957
4958    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4959         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4960       ) {
4961        if (float32_is_signaling_nan(a, status)
4962         || float32_is_signaling_nan(b, status)) {
4963            float_raise(float_flag_invalid, status);
4964        }
4965        return 0;
4966    }
4967    aSign = extractFloat32Sign( a );
4968    bSign = extractFloat32Sign( b );
4969    av = float32_val(a);
4970    bv = float32_val(b);
4971    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4972    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4973
4974}
4975
4976/*----------------------------------------------------------------------------
4977| Returns 1 if the single-precision floating-point values `a' and `b' cannot
4978| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4979| comparison is performed according to the IEC/IEEE Standard for Binary
4980| Floating-Point Arithmetic.
4981*----------------------------------------------------------------------------*/
4982
4983int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4984{
4985    a = float32_squash_input_denormal(a, status);
4986    b = float32_squash_input_denormal(b, status);
4987
4988    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4989         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4990       ) {
4991        if (float32_is_signaling_nan(a, status)
4992         || float32_is_signaling_nan(b, status)) {
4993            float_raise(float_flag_invalid, status);
4994        }
4995        return 1;
4996    }
4997    return 0;
4998}
4999
5000/*----------------------------------------------------------------------------
5001| Returns the result of converting the double-precision floating-point value
5002| `a' to the extended double-precision floating-point format.  The conversion
5003| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5004| Arithmetic.
5005*----------------------------------------------------------------------------*/
5006
5007floatx80 float64_to_floatx80(float64 a, float_status *status)
5008{
5009    flag aSign;
5010    int aExp;
5011    uint64_t aSig;
5012
5013    a = float64_squash_input_denormal(a, status);
5014    aSig = extractFloat64Frac( a );
5015    aExp = extractFloat64Exp( a );
5016    aSign = extractFloat64Sign( a );
5017    if ( aExp == 0x7FF ) {
5018        if (aSig) {
5019            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5020        }
5021        return packFloatx80(aSign,
5022                            floatx80_infinity_high,
5023                            floatx80_infinity_low);
5024    }
5025    if ( aExp == 0 ) {
5026        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5027        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5028    }
5029    return
5030        packFloatx80(
5031            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5032
5033}
5034
5035/*----------------------------------------------------------------------------
5036| Returns the result of converting the double-precision floating-point value
5037| `a' to the quadruple-precision floating-point format.  The conversion is
5038| performed according to the IEC/IEEE Standard for Binary Floating-Point
5039| Arithmetic.
5040*----------------------------------------------------------------------------*/
5041
5042float128 float64_to_float128(float64 a, float_status *status)
5043{
5044    flag aSign;
5045    int aExp;
5046    uint64_t aSig, zSig0, zSig1;
5047
5048    a = float64_squash_input_denormal(a, status);
5049    aSig = extractFloat64Frac( a );
5050    aExp = extractFloat64Exp( a );
5051    aSign = extractFloat64Sign( a );
5052    if ( aExp == 0x7FF ) {
5053        if (aSig) {
5054            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5055        }
5056        return packFloat128( aSign, 0x7FFF, 0, 0 );
5057    }
5058    if ( aExp == 0 ) {
5059        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5060        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5061        --aExp;
5062    }
5063    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5064    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5065
5066}
5067
5068
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled up front: a NaN operand is propagated; an infinite
| `a' or a zero `b' raises the invalid exception and yields the default NaN;
| an infinite `b' or a zero `a' returns `a' unchanged.  Of the two candidate
| remainders straddling zero, the one of smaller magnitude is returned, with
| ties resolved by the parity of the quotient.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Infinite `a' with a non-NaN `b': invalid operation. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is a NaN (propagate) or an infinity (the remainder is `a'). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder with respect to zero is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Restore the implicit leading one and left-justify both significands. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* `a' is at least two binades below `b': it is the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce the exponent gap 62 bits per iteration. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Use the estimate less 2, clamped at zero. */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step covering the remaining expDiff bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /*
     * Keep subtracting bSig until aSig turns negative (as a signed value);
     * alternateASig retains the last non-negative remainder and q counts
     * the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /*
     * Pick whichever candidate is nearer to zero: a negative sum means the
     * negative candidate has the larger magnitude.  On an exact tie the
     * non-negative candidate is taken when q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5156
5157/*----------------------------------------------------------------------------
5158| Returns the binary log of the double-precision floating-point value `a'.
5159| The operation is performed according to the IEC/IEEE Standard for Binary
5160| Floating-Point Arithmetic.
5161*----------------------------------------------------------------------------*/
5162float64 float64_log2(float64 a, float_status *status)
5163{
5164    flag aSign, zSign;
5165    int aExp;
5166    uint64_t aSig, aSig0, aSig1, zSig, i;
5167    a = float64_squash_input_denormal(a, status);
5168
5169    aSig = extractFloat64Frac( a );
5170    aExp = extractFloat64Exp( a );
5171    aSign = extractFloat64Sign( a );
5172
5173    if ( aExp == 0 ) {
5174        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5175        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5176    }
5177    if ( aSign ) {
5178        float_raise(float_flag_invalid, status);
5179        return float64_default_nan(status);
5180    }
5181    if ( aExp == 0x7FF ) {
5182        if (aSig) {
5183            return propagateFloat64NaN(a, float64_zero, status);
5184        }
5185        return a;
5186    }
5187
5188    aExp -= 0x3FF;
5189    aSig |= UINT64_C(0x0010000000000000);
5190    zSign = aExp < 0;
5191    zSig = (uint64_t)aExp << 52;
5192    for (i = 1LL << 51; i > 0; i >>= 1) {
5193        mul64To128( aSig, aSig, &aSig0, &aSig1 );
5194        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5195        if ( aSig & UINT64_C(0x0020000000000000) ) {
5196            aSig >>= 1;
5197            zSig |= i;
5198        }
5199    }
5200
5201    if ( zSign )
5202        zSig = -zSig;
5203    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5204}
5205
5206/*----------------------------------------------------------------------------
5207| Returns 1 if the double-precision floating-point value `a' is equal to the
5208| corresponding value `b', and 0 otherwise.  The invalid exception is raised
5209| if either operand is a NaN.  Otherwise, the comparison is performed
5210| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5211*----------------------------------------------------------------------------*/
5212
5213int float64_eq(float64 a, float64 b, float_status *status)
5214{
5215    uint64_t av, bv;
5216    a = float64_squash_input_denormal(a, status);
5217    b = float64_squash_input_denormal(b, status);
5218
5219    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5220         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5221       ) {
5222        float_raise(float_flag_invalid, status);
5223        return 0;
5224    }
5225    av = float64_val(a);
5226    bv = float64_val(b);
5227    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5228
5229}
5230
5231/*----------------------------------------------------------------------------
5232| Returns 1 if the double-precision floating-point value `a' is less than or
5233| equal to the corresponding value `b', and 0 otherwise.  The invalid
5234| exception is raised if either operand is a NaN.  The comparison is performed
5235| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5236*----------------------------------------------------------------------------*/
5237
5238int float64_le(float64 a, float64 b, float_status *status)
5239{
5240    flag aSign, bSign;
5241    uint64_t av, bv;
5242    a = float64_squash_input_denormal(a, status);
5243    b = float64_squash_input_denormal(b, status);
5244
5245    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5246         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5247       ) {
5248        float_raise(float_flag_invalid, status);
5249        return 0;
5250    }
5251    aSign = extractFloat64Sign( a );
5252    bSign = extractFloat64Sign( b );
5253    av = float64_val(a);
5254    bv = float64_val(b);
5255    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5256    return ( av == bv ) || ( aSign ^ ( av < bv ) );
5257
5258}
5259
5260/*----------------------------------------------------------------------------
5261| Returns 1 if the double-precision floating-point value `a' is less than
5262| the corresponding value `b', and 0 otherwise.  The invalid exception is
5263| raised if either operand is a NaN.  The comparison is performed according
5264| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265*----------------------------------------------------------------------------*/
5266
5267int float64_lt(float64 a, float64 b, float_status *status)
5268{
5269    flag aSign, bSign;
5270    uint64_t av, bv;
5271
5272    a = float64_squash_input_denormal(a, status);
5273    b = float64_squash_input_denormal(b, status);
5274    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5275         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5276       ) {
5277        float_raise(float_flag_invalid, status);
5278        return 0;
5279    }
5280    aSign = extractFloat64Sign( a );
5281    bSign = extractFloat64Sign( b );
5282    av = float64_val(a);
5283    bv = float64_val(b);
5284    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5285    return ( av != bv ) && ( aSign ^ ( av < bv ) );
5286
5287}
5288
5289/*----------------------------------------------------------------------------
5290| Returns 1 if the double-precision floating-point values `a' and `b' cannot
5291| be compared, and 0 otherwise.  The invalid exception is raised if either
5292| operand is a NaN.  The comparison is performed according to the IEC/IEEE
5293| Standard for Binary Floating-Point Arithmetic.
5294*----------------------------------------------------------------------------*/
5295
5296int float64_unordered(float64 a, float64 b, float_status *status)
5297{
5298    a = float64_squash_input_denormal(a, status);
5299    b = float64_squash_input_denormal(b, status);
5300
5301    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5302         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5303       ) {
5304        float_raise(float_flag_invalid, status);
5305        return 1;
5306    }
5307    return 0;
5308}
5309
5310/*----------------------------------------------------------------------------
5311| Returns 1 if the double-precision floating-point value `a' is equal to the
5312| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5313| exception.The comparison is performed according to the IEC/IEEE Standard
5314| for Binary Floating-Point Arithmetic.
5315*----------------------------------------------------------------------------*/
5316
5317int float64_eq_quiet(float64 a, float64 b, float_status *status)
5318{
5319    uint64_t av, bv;
5320    a = float64_squash_input_denormal(a, status);
5321    b = float64_squash_input_denormal(b, status);
5322
5323    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5324         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5325       ) {
5326        if (float64_is_signaling_nan(a, status)
5327         || float64_is_signaling_nan(b, status)) {
5328            float_raise(float_flag_invalid, status);
5329        }
5330        return 0;
5331    }
5332    av = float64_val(a);
5333    bv = float64_val(b);
5334    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5335
5336}
5337
5338/*----------------------------------------------------------------------------
5339| Returns 1 if the double-precision floating-point value `a' is less than or
5340| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5341| cause an exception.  Otherwise, the comparison is performed according to the
5342| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343*----------------------------------------------------------------------------*/
5344
5345int float64_le_quiet(float64 a, float64 b, float_status *status)
5346{
5347    flag aSign, bSign;
5348    uint64_t av, bv;
5349    a = float64_squash_input_denormal(a, status);
5350    b = float64_squash_input_denormal(b, status);
5351
5352    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5353         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5354       ) {
5355        if (float64_is_signaling_nan(a, status)
5356         || float64_is_signaling_nan(b, status)) {
5357            float_raise(float_flag_invalid, status);
5358        }
5359        return 0;
5360    }
5361    aSign = extractFloat64Sign( a );
5362    bSign = extractFloat64Sign( b );
5363    av = float64_val(a);
5364    bv = float64_val(b);
5365    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5366    return ( av == bv ) || ( aSign ^ ( av < bv ) );
5367
5368}
5369
5370/*----------------------------------------------------------------------------
5371| Returns 1 if the double-precision floating-point value `a' is less than
5372| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5373| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5374| Standard for Binary Floating-Point Arithmetic.
5375*----------------------------------------------------------------------------*/
5376
5377int float64_lt_quiet(float64 a, float64 b, float_status *status)
5378{
5379    flag aSign, bSign;
5380    uint64_t av, bv;
5381    a = float64_squash_input_denormal(a, status);
5382    b = float64_squash_input_denormal(b, status);
5383
5384    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5385         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5386       ) {
5387        if (float64_is_signaling_nan(a, status)
5388         || float64_is_signaling_nan(b, status)) {
5389            float_raise(float_flag_invalid, status);
5390        }
5391        return 0;
5392    }
5393    aSign = extractFloat64Sign( a );
5394    bSign = extractFloat64Sign( b );
5395    av = float64_val(a);
5396    bv = float64_val(b);
5397    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5398    return ( av != bv ) && ( aSign ^ ( av < bv ) );
5399
5400}
5401
5402/*----------------------------------------------------------------------------
5403| Returns 1 if the double-precision floating-point values `a' and `b' cannot
5404| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5405| comparison is performed according to the IEC/IEEE Standard for Binary
5406| Floating-Point Arithmetic.
5407*----------------------------------------------------------------------------*/
5408
5409int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5410{
5411    a = float64_squash_input_denormal(a, status);
5412    b = float64_squash_input_denormal(b, status);
5413
5414    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5415         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5416       ) {
5417        if (float64_is_signaling_nan(a, status)
5418         || float64_is_signaling_nan(b, status)) {
5419            float_raise(float_flag_invalid, status);
5420        }
5421        return 1;
5422    }
5423    return 0;
5424}
5425
5426/*----------------------------------------------------------------------------
5427| Returns the result of converting the extended double-precision floating-
5428| point value `a' to the 32-bit two's complement integer format.  The
5429| conversion is performed according to the IEC/IEEE Standard for Binary
5430| Floating-Point Arithmetic---which means in particular that the conversion
5431| is rounded according to the current rounding mode.  If `a' is a NaN, the
5432| largest positive integer is returned.  Otherwise, if the conversion
5433| overflows, the largest integer with the same sign as `a' is returned.
5434*----------------------------------------------------------------------------*/
5435
5436int32_t floatx80_to_int32(floatx80 a, float_status *status)
5437{
5438    flag aSign;
5439    int32_t aExp, shiftCount;
5440    uint64_t aSig;
5441
5442    if (floatx80_invalid_encoding(a)) {
5443        float_raise(float_flag_invalid, status);
5444        return 1 << 31;
5445    }
5446    aSig = extractFloatx80Frac( a );
5447    aExp = extractFloatx80Exp( a );
5448    aSign = extractFloatx80Sign( a );
5449    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5450    shiftCount = 0x4037 - aExp;
5451    if ( shiftCount <= 0 ) shiftCount = 1;
5452    shift64RightJamming( aSig, shiftCount, &aSig );
5453    return roundAndPackInt32(aSign, aSig, status);
5454
5455}
5456
5457/*----------------------------------------------------------------------------
5458| Returns the result of converting the extended double-precision floating-
5459| point value `a' to the 32-bit two's complement integer format.  The
5460| conversion is performed according to the IEC/IEEE Standard for Binary
5461| Floating-Point Arithmetic, except that the conversion is always rounded
5462| toward zero.  If `a' is a NaN, the largest positive integer is returned.
5463| Otherwise, if the conversion overflows, the largest integer with the same
5464| sign as `a' is returned.
5465*----------------------------------------------------------------------------*/
5466
5467int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5468{
5469    flag aSign;
5470    int32_t aExp, shiftCount;
5471    uint64_t aSig, savedASig;
5472    int32_t z;
5473
5474    if (floatx80_invalid_encoding(a)) {
5475        float_raise(float_flag_invalid, status);
5476        return 1 << 31;
5477    }
5478    aSig = extractFloatx80Frac( a );
5479    aExp = extractFloatx80Exp( a );
5480    aSign = extractFloatx80Sign( a );
5481    if ( 0x401E < aExp ) {
5482        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5483        goto invalid;
5484    }
5485    else if ( aExp < 0x3FFF ) {
5486        if (aExp || aSig) {
5487            status->float_exception_flags |= float_flag_inexact;
5488        }
5489        return 0;
5490    }
5491    shiftCount = 0x403E - aExp;
5492    savedASig = aSig;
5493    aSig >>= shiftCount;
5494    z = aSig;
5495    if ( aSign ) z = - z;
5496    if ( ( z < 0 ) ^ aSign ) {
5497 invalid:
5498        float_raise(float_flag_invalid, status);
5499        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5500    }
5501    if ( ( aSig<<shiftCount ) != savedASig ) {
5502        status->float_exception_flags |= float_flag_inexact;
5503    }
5504    return z;
5505
5506}
5507
5508/*----------------------------------------------------------------------------
5509| Returns the result of converting the extended double-precision floating-
5510| point value `a' to the 64-bit two's complement integer format.  The
5511| conversion is performed according to the IEC/IEEE Standard for Binary
5512| Floating-Point Arithmetic---which means in particular that the conversion
5513| is rounded according to the current rounding mode.  If `a' is a NaN,
5514| the largest positive integer is returned.  Otherwise, if the conversion
5515| overflows, the largest integer with the same sign as `a' is returned.
5516*----------------------------------------------------------------------------*/
5517
5518int64_t floatx80_to_int64(floatx80 a, float_status *status)
5519{
5520    flag aSign;
5521    int32_t aExp, shiftCount;
5522    uint64_t aSig, aSigExtra;
5523
5524    if (floatx80_invalid_encoding(a)) {
5525        float_raise(float_flag_invalid, status);
5526        return 1ULL << 63;
5527    }
5528    aSig = extractFloatx80Frac( a );
5529    aExp = extractFloatx80Exp( a );
5530    aSign = extractFloatx80Sign( a );
5531    shiftCount = 0x403E - aExp;
5532    if ( shiftCount <= 0 ) {
5533        if ( shiftCount ) {
5534            float_raise(float_flag_invalid, status);
5535            if (!aSign || floatx80_is_any_nan(a)) {
5536                return INT64_MAX;
5537            }
5538            return INT64_MIN;
5539        }
5540        aSigExtra = 0;
5541    }
5542    else {
5543        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5544    }
5545    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5546
5547}
5548
5549/*----------------------------------------------------------------------------
5550| Returns the result of converting the extended double-precision floating-
5551| point value `a' to the 64-bit two's complement integer format.  The
5552| conversion is performed according to the IEC/IEEE Standard for Binary
5553| Floating-Point Arithmetic, except that the conversion is always rounded
5554| toward zero.  If `a' is a NaN, the largest positive integer is returned.
5555| Otherwise, if the conversion overflows, the largest integer with the same
5556| sign as `a' is returned.
5557*----------------------------------------------------------------------------*/
5558
5559int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5560{
5561    flag aSign;
5562    int32_t aExp, shiftCount;
5563    uint64_t aSig;
5564    int64_t z;
5565
5566    if (floatx80_invalid_encoding(a)) {
5567        float_raise(float_flag_invalid, status);
5568        return 1ULL << 63;
5569    }
5570    aSig = extractFloatx80Frac( a );
5571    aExp = extractFloatx80Exp( a );
5572    aSign = extractFloatx80Sign( a );
5573    shiftCount = aExp - 0x403E;
5574    if ( 0 <= shiftCount ) {
5575        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5576        if ( ( a.high != 0xC03E ) || aSig ) {
5577            float_raise(float_flag_invalid, status);
5578            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5579                return INT64_MAX;
5580            }
5581        }
5582        return INT64_MIN;
5583    }
5584    else if ( aExp < 0x3FFF ) {
5585        if (aExp | aSig) {
5586            status->float_exception_flags |= float_flag_inexact;
5587        }
5588        return 0;
5589    }
5590    z = aSig>>( - shiftCount );
5591    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5592        status->float_exception_flags |= float_flag_inexact;
5593    }
5594    if ( aSign ) z = - z;
5595    return z;
5596
5597}
5598
5599/*----------------------------------------------------------------------------
5600| Returns the result of converting the extended double-precision floating-
5601| point value `a' to the single-precision floating-point format.  The
5602| conversion is performed according to the IEC/IEEE Standard for Binary
5603| Floating-Point Arithmetic.
5604*----------------------------------------------------------------------------*/
5605
5606float32 floatx80_to_float32(floatx80 a, float_status *status)
5607{
5608    flag aSign;
5609    int32_t aExp;
5610    uint64_t aSig;
5611
5612    if (floatx80_invalid_encoding(a)) {
5613        float_raise(float_flag_invalid, status);
5614        return float32_default_nan(status);
5615    }
5616    aSig = extractFloatx80Frac( a );
5617    aExp = extractFloatx80Exp( a );
5618    aSign = extractFloatx80Sign( a );
5619    if ( aExp == 0x7FFF ) {
5620        if ( (uint64_t) ( aSig<<1 ) ) {
5621            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5622        }
5623        return packFloat32( aSign, 0xFF, 0 );
5624    }
5625    shift64RightJamming( aSig, 33, &aSig );
5626    if ( aExp || aSig ) aExp -= 0x3F81;
5627    return roundAndPackFloat32(aSign, aExp, aSig, status);
5628
5629}
5630
<