/*
 * QEMU float support macros
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

/*
===============================================================================
This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2a.

Written by John R. Hauser.  This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704.  Funding was partially provided by the
National Science Foundation under grant MIP-9311980.  The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================
*/

/* BSD licensing:
 * Copyright (c) 2006, Fabrice Bellard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

#ifndef FPU_SOFTFLOAT_MACROS_H
#define FPU_SOFTFLOAT_MACROS_H

#include "fpu/softfloat-types.h"
#include "qemu/host-utils.h"
/**
 * shl_double: double-word merging left shift
 * @l: left or most-significant word
 * @r: right or least-significant word
 * @c: shift count; must be in [0, 63] (the C fallback computes 64 - @c,
 *     so c >= 64 would be undefined behaviour — callers keep it in range)
 *
 * Shift @l left by @c bits, shifting in bits from @r, i.e. return the
 * most-significant 64 bits of the 128-bit value l:r shifted left by @c.
 */
static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* SHLD does exactly this merge; "c" constraint = count in CL, "i" = imm. */
    asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
    return l;
#else
    /* c == 0 is special-cased because r >> 64 would be undefined. */
    return c ? (l << c) | (r >> (64 - c)) : l;
#endif
}
/**
 * shr_double: double-word merging right shift
 * @l: left or most-significant word
 * @r: right or least-significant word
 * @c: shift count; must be in [0, 63] (the C fallback computes 64 - @c,
 *     so c >= 64 would be undefined behaviour — callers keep it in range)
 *
 * Shift @r right by @c bits, shifting in bits from @l, i.e. return the
 * least-significant 64 bits of the 128-bit value l:r shifted right by @c.
 */
static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* SHRD does exactly this merge; "c" constraint = count in CL, "i" = imm. */
    asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
    return r;
#else
    /* c == 0 is special-cased because l << 64 would be undefined. */
    return c ? (r >> c) | (l << (64 - c)) : r;
#endif
}
 124/*----------------------------------------------------------------------------
 125| Shifts `a' right by the number of bits given in `count'.  If any nonzero
 126| bits are shifted off, they are ``jammed'' into the least significant bit of
 127| the result by setting the least significant bit to 1.  The value of `count'
 128| can be arbitrarily large; in particular, if `count' is greater than 32, the
 129| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 130| The result is stored in the location pointed to by `zPtr'.
 131*----------------------------------------------------------------------------*/
 132
 133static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
 134{
 135    uint32_t z;
 136
 137    if ( count == 0 ) {
 138        z = a;
 139    }
 140    else if ( count < 32 ) {
 141        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
 142    }
 143    else {
 144        z = ( a != 0 );
 145    }
 146    *zPtr = z;
 147
 148}
 149
 150/*----------------------------------------------------------------------------
 151| Shifts `a' right by the number of bits given in `count'.  If any nonzero
 152| bits are shifted off, they are ``jammed'' into the least significant bit of
 153| the result by setting the least significant bit to 1.  The value of `count'
 154| can be arbitrarily large; in particular, if `count' is greater than 64, the
 155| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 156| The result is stored in the location pointed to by `zPtr'.
 157*----------------------------------------------------------------------------*/
 158
 159static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
 160{
 161    uint64_t z;
 162
 163    if ( count == 0 ) {
 164        z = a;
 165    }
 166    else if ( count < 64 ) {
 167        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
 168    }
 169    else {
 170        z = ( a != 0 );
 171    }
 172    *zPtr = z;
 173
 174}
 175
 176/*----------------------------------------------------------------------------
 177| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
 178| _plus_ the number of bits given in `count'.  The shifted result is at most
 179| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
 180| bits shifted off form a second 64-bit result as follows:  The _last_ bit
 181| shifted off is the most-significant bit of the extra result, and the other
 182| 63 bits of the extra result are all zero if and only if _all_but_the_last_
 183| bits shifted off were all zero.  This extra result is stored in the location
 184| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
 185|     (This routine makes more sense if `a0' and `a1' are considered to form a
 186| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
 187| value is shifted right by the number of bits given in `count', and the
 188| integer part of the result is returned at the location pointed to by
 189| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 190| described above, and is returned at the location pointed to by `z1Ptr'.)
 191*----------------------------------------------------------------------------*/
 192
 193static inline void
 194 shift64ExtraRightJamming(
 195     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 196{
 197    uint64_t z0, z1;
 198    int8_t negCount = ( - count ) & 63;
 199
 200    if ( count == 0 ) {
 201        z1 = a1;
 202        z0 = a0;
 203    }
 204    else if ( count < 64 ) {
 205        z1 = ( a0<<negCount ) | ( a1 != 0 );
 206        z0 = a0>>count;
 207    }
 208    else {
 209        if ( count == 64 ) {
 210            z1 = a0 | ( a1 != 0 );
 211        }
 212        else {
 213            z1 = ( ( a0 | a1 ) != 0 );
 214        }
 215        z0 = 0;
 216    }
 217    *z1Ptr = z1;
 218    *z0Ptr = z0;
 219
 220}
 221
 222/*----------------------------------------------------------------------------
 223| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 224| number of bits given in `count'.  Any bits shifted off are lost.  The value
 225| of `count' can be arbitrarily large; in particular, if `count' is greater
 226| than 128, the result will be 0.  The result is broken into two 64-bit pieces
 227| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 228*----------------------------------------------------------------------------*/
 229
 230static inline void
 231 shift128Right(
 232     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 233{
 234    uint64_t z0, z1;
 235    int8_t negCount = ( - count ) & 63;
 236
 237    if ( count == 0 ) {
 238        z1 = a1;
 239        z0 = a0;
 240    }
 241    else if ( count < 64 ) {
 242        z1 = ( a0<<negCount ) | ( a1>>count );
 243        z0 = a0>>count;
 244    }
 245    else {
 246        z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
 247        z0 = 0;
 248    }
 249    *z1Ptr = z1;
 250    *z0Ptr = z0;
 251
 252}
 253
 254/*----------------------------------------------------------------------------
 255| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 256| number of bits given in `count'.  If any nonzero bits are shifted off, they
 257| are ``jammed'' into the least significant bit of the result by setting the
 258| least significant bit to 1.  The value of `count' can be arbitrarily large;
 259| in particular, if `count' is greater than 128, the result will be either
 260| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
 261| nonzero.  The result is broken into two 64-bit pieces which are stored at
 262| the locations pointed to by `z0Ptr' and `z1Ptr'.
 263*----------------------------------------------------------------------------*/
 264
 265static inline void
 266 shift128RightJamming(
 267     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 268{
 269    uint64_t z0, z1;
 270    int8_t negCount = ( - count ) & 63;
 271
 272    if ( count == 0 ) {
 273        z1 = a1;
 274        z0 = a0;
 275    }
 276    else if ( count < 64 ) {
 277        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
 278        z0 = a0>>count;
 279    }
 280    else {
 281        if ( count == 64 ) {
 282            z1 = a0 | ( a1 != 0 );
 283        }
 284        else if ( count < 128 ) {
 285            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
 286        }
 287        else {
 288            z1 = ( ( a0 | a1 ) != 0 );
 289        }
 290        z0 = 0;
 291    }
 292    *z1Ptr = z1;
 293    *z0Ptr = z0;
 294
 295}
 296
/*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
| by 64 _plus_ the number of bits given in `count'.  The shifted result is
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
| off form a third 64-bit result as follows:  The _last_ bit shifted off is
| the most-significant bit of the extra result, and the other 63 bits of the
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
| were all zero.  This extra result is stored in the location pointed to by
| `z2Ptr'.  The value of `count' can be arbitrarily large.
|     (This routine makes more sense if `a0', `a1', and `a2' are considered
| to form a fixed-point value with binary point between `a1' and `a2'.  This
| fixed-point value is shifted right by the number of bits given in `count',
| and the integer part of the result is returned at the locations pointed to
| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
| corrupted as described above, and is returned at the location pointed to by
| `z2Ptr'.)
*----------------------------------------------------------------------------*/

static inline void
 shift128ExtraRightJamming(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0, z1, z2;
    /* negCount == (64 - count) mod 64; only used when 0 < count < 128. */
    int8_t negCount = ( - count ) & 63;

    if ( count == 0 ) {
        /* No shift: pass all three words through unchanged. */
        z2 = a2;
        z1 = a1;
        z0 = a0;
    }
    else {
        if ( count < 64 ) {
            /* Bits drop from a1 into the extra word; a0 spills into z1. */
            z2 = a1<<negCount;
            z1 = ( a0<<negCount ) | ( a1>>count );
            z0 = a0>>count;
        }
        else {
            if ( count == 64 ) {
                /* Whole words move down one position. */
                z2 = a1;
                z1 = a0;
            }
            else {
                /* count > 64: all of a1 is shifted off; fold it into a2 so
                 * the sticky OR at the bottom accounts for it. */
                a2 |= a1;
                if ( count < 128 ) {
                    z2 = a0<<negCount;
                    z1 = a0>>( count & 63 );
                }
                else {
                    /* count == 128: a0 becomes the extra word exactly;
                     * beyond that only its nonzero-ness survives. */
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
                    z1 = 0;
                }
            }
            z0 = 0;
        }
        /* Jam: any nonzero bits that fell past the extra word set its LSB. */
        z2 |= ( a2 != 0 );
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
 367/*----------------------------------------------------------------------------
 368| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 369| number of bits given in `count'.  Any bits shifted off are lost.  The value
 370| of `count' must be less than 64.  The result is broken into two 64-bit
 371| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 372*----------------------------------------------------------------------------*/
 373
 374static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
 375                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
 376{
 377    *z1Ptr = a1 << count;
 378    *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 379}
 380
 381/*----------------------------------------------------------------------------
 382| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 383| number of bits given in `count'.  Any bits shifted off are lost.  The value
 384| of `count' may be greater than 64.  The result is broken into two 64-bit
 385| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 386*----------------------------------------------------------------------------*/
 387
 388static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
 389                                uint64_t *z0Ptr, uint64_t *z1Ptr)
 390{
 391    if (count < 64) {
 392        *z1Ptr = a1 << count;
 393        *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 394    } else {
 395        *z1Ptr = 0;
 396        *z0Ptr = a1 << (count - 64);
 397    }
 398}
 399
 400/*----------------------------------------------------------------------------
 401| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
 402| by the number of bits given in `count'.  Any bits shifted off are lost.
 403| The value of `count' must be less than 64.  The result is broken into three
 404| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 405| `z1Ptr', and `z2Ptr'.
 406*----------------------------------------------------------------------------*/
 407
 408static inline void
 409 shortShift192Left(
 410     uint64_t a0,
 411     uint64_t a1,
 412     uint64_t a2,
 413     int count,
 414     uint64_t *z0Ptr,
 415     uint64_t *z1Ptr,
 416     uint64_t *z2Ptr
 417 )
 418{
 419    uint64_t z0, z1, z2;
 420    int8_t negCount;
 421
 422    z2 = a2<<count;
 423    z1 = a1<<count;
 424    z0 = a0<<count;
 425    if ( 0 < count ) {
 426        negCount = ( ( - count ) & 63 );
 427        z1 |= a2>>negCount;
 428        z0 |= a1>>negCount;
 429    }
 430    *z2Ptr = z2;
 431    *z1Ptr = z1;
 432    *z0Ptr = z0;
 433
 434}
 435
 436/*----------------------------------------------------------------------------
 437| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
 438| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
 439| any carry out is lost.  The result is broken into two 64-bit pieces which
 440| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 441*----------------------------------------------------------------------------*/
 442
 443static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
 444                          uint64_t *z0Ptr, uint64_t *z1Ptr)
 445{
 446    bool c = 0;
 447    *z1Ptr = uadd64_carry(a1, b1, &c);
 448    *z0Ptr = uadd64_carry(a0, b0, &c);
 449}
 450
 451/*----------------------------------------------------------------------------
 452| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
 453| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
 454| modulo 2^192, so any carry out is lost.  The result is broken into three
 455| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 456| `z1Ptr', and `z2Ptr'.
 457*----------------------------------------------------------------------------*/
 458
 459static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
 460                          uint64_t b0, uint64_t b1, uint64_t b2,
 461                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
 462{
 463    bool c = 0;
 464    *z2Ptr = uadd64_carry(a2, b2, &c);
 465    *z1Ptr = uadd64_carry(a1, b1, &c);
 466    *z0Ptr = uadd64_carry(a0, b0, &c);
 467}
 468
 469/*----------------------------------------------------------------------------
 470| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
 471| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
 472| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
 473| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
 474| `z1Ptr'.
 475*----------------------------------------------------------------------------*/
 476
 477static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
 478                          uint64_t *z0Ptr, uint64_t *z1Ptr)
 479{
 480    bool c = 0;
 481    *z1Ptr = usub64_borrow(a1, b1, &c);
 482    *z0Ptr = usub64_borrow(a0, b0, &c);
 483}
 484
 485/*----------------------------------------------------------------------------
 486| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
 487| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
 488| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
 489| result is broken into three 64-bit pieces which are stored at the locations
 490| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
 491*----------------------------------------------------------------------------*/
 492
 493static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
 494                          uint64_t b0, uint64_t b1, uint64_t b2,
 495                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
 496{
 497    bool c = 0;
 498    *z2Ptr = usub64_borrow(a2, b2, &c);
 499    *z1Ptr = usub64_borrow(a1, b1, &c);
 500    *z0Ptr = usub64_borrow(a0, b0, &c);
 501}
 502
 503/*----------------------------------------------------------------------------
 504| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
 505| into two 64-bit pieces which are stored at the locations pointed to by
 506| `z0Ptr' and `z1Ptr'.
 507*----------------------------------------------------------------------------*/
 508
 509static inline void
 510mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
 511{
 512    mulu64(z1Ptr, z0Ptr, a, b);
 513}
 514
 515/*----------------------------------------------------------------------------
 516| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
 517| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
 518| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
 519| `z2Ptr'.
 520*----------------------------------------------------------------------------*/
 521
 522static inline void
 523mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
 524                uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
 525{
 526    uint64_t z0, z1, m1;
 527
 528    mul64To128(a1, b, &m1, z2Ptr);
 529    mul64To128(a0, b, &z0, &z1);
 530    add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
 531}
 532
 533/*----------------------------------------------------------------------------
 534| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
 535| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
 536| product.  The product is broken into four 64-bit pieces which are stored at
 537| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
 538*----------------------------------------------------------------------------*/
 539
 540static inline void mul128To256(uint64_t a0, uint64_t a1,
 541                               uint64_t b0, uint64_t b1,
 542                               uint64_t *z0Ptr, uint64_t *z1Ptr,
 543                               uint64_t *z2Ptr, uint64_t *z3Ptr)
 544{
 545    uint64_t z0, z1, z2;
 546    uint64_t m0, m1, m2, n1, n2;
 547
 548    mul64To128(a1, b0, &m1, &m2);
 549    mul64To128(a0, b1, &n1, &n2);
 550    mul64To128(a1, b1, &z2, z3Ptr);
 551    mul64To128(a0, b0, &z0, &z1);
 552
 553    add192( 0, m1, m2,  0, n1, n2, &m0, &m1, &m2);
 554    add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
 555}
 556
/*----------------------------------------------------------------------------
| Returns an approximation to the 64-bit integer quotient obtained by dividing
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
| toward zero, the approximation returned lies between q and q + 2 inclusive.
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
| unsigned integer is returned.
*----------------------------------------------------------------------------*/

static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Quotient would need more than 64 bits: saturate. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* First estimate of the high 32 quotient bits, using the top 32 bits
     * of the divisor (saturated to avoid a >32-bit partial quotient). */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* The estimate may be too large; step it down until the partial
     * remainder a0:a1 - b*z becomes non-negative. */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    /* Low 32 quotient bits estimated the same way from the remainder. */
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;

}
/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 *
 * Licensed under the GPLv2/LGPLv3
 *
 * Divide the 128-bit value n1:n0 by `d': the 64-bit quotient is returned
 * and the remainder is stored at `r'.  NOTE(review): like GMP's
 * __udiv_qrnnd_c, the generic fallback below assumes `d' is normalized
 * (most significant bit set; in particular d >= 2^32 so d1 != 0) and that
 * the quotient fits in 64 bits (n1 < d) — confirm callers guarantee this.
 */
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
                                  uint64_t n0, uint64_t d)
{
#if defined(__x86_64__)
    uint64_t q;
    /* DIVQ divides RDX:RAX by the operand; quotient in RAX, remainder RDX. */
    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
    return q;
#elif defined(__s390x__) && !defined(__clang__)
    /* Need to use a TImode type to get an even register pair for DLGR.  */
    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
    *r = n >> 64;
    return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
    /* From Power ISA 2.06, programming note for divdeu.  */
    uint64_t q1, q2, Q, r1, r2, R;
    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
        : "=&r"(q1), "=r"(q2)
        : "r"(n1), "r"(n0), "r"(d));
    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
    r2 = n0 - (q2 * d);
    Q = q1 + q2;
    R = r1 + r2;
    if (R >= d || R < r2) { /* overflow implies R > d */
        Q += 1;
        R -= d;
    }
    *r = R;
    return Q;
#else
    /* Generic schoolbook long division in 32-bit digits. */
    uint64_t d0, d1, q0, q1, r1, r0, m;

    d0 = (uint32_t)d;
    d1 = d >> 32;

    /* High quotient digit from the top 96 bits. */
    r1 = n1 % d1;
    q1 = n1 / d1;
    m = q1 * d0;
    r1 = (r1 << 32) | (n0 >> 32);
    /* q1 may be up to 2 too large; correct it. */
    if (r1 < m) {
        q1 -= 1;
        r1 += d;
        if (r1 >= d) {
            if (r1 < m) {
                q1 -= 1;
                r1 += d;
            }
        }
    }
    r1 -= m;

    /* Low quotient digit from the remainder and the low 32 bits of n0. */
    r0 = r1 % d1;
    q0 = r1 / d1;
    m = q0 * d0;
    r0 = (r0 << 32) | (uint32_t)n0;
    if (r0 < m) {
        q0 -= 1;
        r0 += d;
        if (r0 >= d) {
            if (r0 < m) {
                q0 -= 1;
                r0 += d;
            }
        }
    }
    r0 -= m;

    *r = r0;
    return (q1 << 32) | q0;
#endif
}
 665/*----------------------------------------------------------------------------
 666| Returns an approximation to the square root of the 32-bit significand given
 667| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 668| `aExp' (the least significant bit) is 1, the integer returned approximates
 669| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
 670| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
 671| case, the approximation returned lies strictly within +/-2 of the exact
 672| value.
 673*----------------------------------------------------------------------------*/
 674
 675static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
 676{
 677    static const uint16_t sqrtOddAdjustments[] = {
 678        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
 679        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
 680    };
 681    static const uint16_t sqrtEvenAdjustments[] = {
 682        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
 683        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
 684    };
 685    int8_t index;
 686    uint32_t z;
 687
 688    index = ( a>>27 ) & 15;
 689    if ( aExp & 1 ) {
 690        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
 691        z = ( ( a / z )<<14 ) + ( z<<15 );
 692        a >>= 1;
 693    }
 694    else {
 695        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
 696        z = a / z + z;
 697        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
 698        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
 699    }
 700    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
 701
 702}
 703
 704/*----------------------------------------------------------------------------
 705| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
 706| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
 707| Otherwise, returns 0.
 708*----------------------------------------------------------------------------*/
 709
 710static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 711{
 712    return a0 == b0 && a1 == b1;
 713}
 714
 715/*----------------------------------------------------------------------------
 716| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 717| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
 718| Otherwise, returns 0.
 719*----------------------------------------------------------------------------*/
 720
 721static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 722{
 723    return a0 < b0 || (a0 == b0 && a1 <= b1);
 724}
 725
 726/*----------------------------------------------------------------------------
 727| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 728| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
 729| returns 0.
 730*----------------------------------------------------------------------------*/
 731
 732static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 733{
 734    return a0 < b0 || (a0 == b0 && a1 < b1);
 735}
 736
 737/*----------------------------------------------------------------------------
 738| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
 739| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
 740| Otherwise, returns 0.
 741*----------------------------------------------------------------------------*/
 742
 743static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 744{
 745    return a0 != b0 || a1 != b1;
 746}
 747
 748/*
 749 * Similarly, comparisons of 192-bit values.
 750 */
 751
 752static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
 753                         uint64_t b0, uint64_t b1, uint64_t b2)
 754{
 755    return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
 756}
 757
 758static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
 759                         uint64_t b0, uint64_t b1, uint64_t b2)
 760{
 761    if (a0 != b0) {
 762        return a0 < b0;
 763    }
 764    if (a1 != b1) {
 765        return a1 < b1;
 766    }
 767    return a2 <= b2;
 768}
 769
 770static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
 771                         uint64_t b0, uint64_t b1, uint64_t b2)
 772{
 773    if (a0 != b0) {
 774        return a0 < b0;
 775    }
 776    if (a1 != b1) {
 777        return a1 < b1;
 778    }
 779    return a2 < b2;
 780}
 781
#endif