qemu/include/fpu/softfloat-macros.h
<<
>>
Prefs
   1/*
   2 * QEMU float support macros
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/*----------------------------------------------------------------------------
  83| Shifts `a' right by the number of bits given in `count'.  If any nonzero
  84| bits are shifted off, they are ``jammed'' into the least significant bit of
  85| the result by setting the least significant bit to 1.  The value of `count'
  86| can be arbitrarily large; in particular, if `count' is greater than 32, the
  87| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  88| The result is stored in the location pointed to by `zPtr'.
  89*----------------------------------------------------------------------------*/
  90
  91static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
  92{
  93    uint32_t z;
  94
  95    if ( count == 0 ) {
  96        z = a;
  97    }
  98    else if ( count < 32 ) {
  99        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
 100    }
 101    else {
 102        z = ( a != 0 );
 103    }
 104    *zPtr = z;
 105
 106}
 107
 108/*----------------------------------------------------------------------------
 109| Shifts `a' right by the number of bits given in `count'.  If any nonzero
 110| bits are shifted off, they are ``jammed'' into the least significant bit of
 111| the result by setting the least significant bit to 1.  The value of `count'
 112| can be arbitrarily large; in particular, if `count' is greater than 64, the
 113| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 114| The result is stored in the location pointed to by `zPtr'.
 115*----------------------------------------------------------------------------*/
 116
 117static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
 118{
 119    uint64_t z;
 120
 121    if ( count == 0 ) {
 122        z = a;
 123    }
 124    else if ( count < 64 ) {
 125        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
 126    }
 127    else {
 128        z = ( a != 0 );
 129    }
 130    *zPtr = z;
 131
 132}
 133
 134/*----------------------------------------------------------------------------
 135| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
 136| _plus_ the number of bits given in `count'.  The shifted result is at most
 137| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
 138| bits shifted off form a second 64-bit result as follows:  The _last_ bit
 139| shifted off is the most-significant bit of the extra result, and the other
 140| 63 bits of the extra result are all zero if and only if _all_but_the_last_
 141| bits shifted off were all zero.  This extra result is stored in the location
 142| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
 143|     (This routine makes more sense if `a0' and `a1' are considered to form a
 144| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
 145| value is shifted right by the number of bits given in `count', and the
 146| integer part of the result is returned at the location pointed to by
 147| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 148| described above, and is returned at the location pointed to by `z1Ptr'.)
 149*----------------------------------------------------------------------------*/
 150
 151static inline void
 152 shift64ExtraRightJamming(
 153     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 154{
 155    uint64_t z0, z1;
 156    int8_t negCount = ( - count ) & 63;
 157
 158    if ( count == 0 ) {
 159        z1 = a1;
 160        z0 = a0;
 161    }
 162    else if ( count < 64 ) {
 163        z1 = ( a0<<negCount ) | ( a1 != 0 );
 164        z0 = a0>>count;
 165    }
 166    else {
 167        if ( count == 64 ) {
 168            z1 = a0 | ( a1 != 0 );
 169        }
 170        else {
 171            z1 = ( ( a0 | a1 ) != 0 );
 172        }
 173        z0 = 0;
 174    }
 175    *z1Ptr = z1;
 176    *z0Ptr = z0;
 177
 178}
 179
 180/*----------------------------------------------------------------------------
 181| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 182| number of bits given in `count'.  Any bits shifted off are lost.  The value
 183| of `count' can be arbitrarily large; in particular, if `count' is greater
 184| than 128, the result will be 0.  The result is broken into two 64-bit pieces
 185| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 186*----------------------------------------------------------------------------*/
 187
 188static inline void
 189 shift128Right(
 190     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 191{
 192    uint64_t z0, z1;
 193    int8_t negCount = ( - count ) & 63;
 194
 195    if ( count == 0 ) {
 196        z1 = a1;
 197        z0 = a0;
 198    }
 199    else if ( count < 64 ) {
 200        z1 = ( a0<<negCount ) | ( a1>>count );
 201        z0 = a0>>count;
 202    }
 203    else {
 204        z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
 205        z0 = 0;
 206    }
 207    *z1Ptr = z1;
 208    *z0Ptr = z0;
 209
 210}
 211
 212/*----------------------------------------------------------------------------
 213| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 214| number of bits given in `count'.  If any nonzero bits are shifted off, they
 215| are ``jammed'' into the least significant bit of the result by setting the
 216| least significant bit to 1.  The value of `count' can be arbitrarily large;
 217| in particular, if `count' is greater than 128, the result will be either
 218| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
 219| nonzero.  The result is broken into two 64-bit pieces which are stored at
 220| the locations pointed to by `z0Ptr' and `z1Ptr'.
 221*----------------------------------------------------------------------------*/
 222
 223static inline void
 224 shift128RightJamming(
 225     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 226{
 227    uint64_t z0, z1;
 228    int8_t negCount = ( - count ) & 63;
 229
 230    if ( count == 0 ) {
 231        z1 = a1;
 232        z0 = a0;
 233    }
 234    else if ( count < 64 ) {
 235        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
 236        z0 = a0>>count;
 237    }
 238    else {
 239        if ( count == 64 ) {
 240            z1 = a0 | ( a1 != 0 );
 241        }
 242        else if ( count < 128 ) {
 243            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
 244        }
 245        else {
 246            z1 = ( ( a0 | a1 ) != 0 );
 247        }
 248        z0 = 0;
 249    }
 250    *z1Ptr = z1;
 251    *z0Ptr = z0;
 252
 253}
 254
 255/*----------------------------------------------------------------------------
 256| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
 257| by 64 _plus_ the number of bits given in `count'.  The shifted result is
 258| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
 259| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
 260| off form a third 64-bit result as follows:  The _last_ bit shifted off is
 261| the most-significant bit of the extra result, and the other 63 bits of the
 262| extra result are all zero if and only if _all_but_the_last_ bits shifted off
 263| were all zero.  This extra result is stored in the location pointed to by
 264| `z2Ptr'.  The value of `count' can be arbitrarily large.
 265|     (This routine makes more sense if `a0', `a1', and `a2' are considered
 266| to form a fixed-point value with binary point between `a1' and `a2'.  This
 267| fixed-point value is shifted right by the number of bits given in `count',
 268| and the integer part of the result is returned at the locations pointed to
 269| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
 270| corrupted as described above, and is returned at the location pointed to by
 271| `z2Ptr'.)
 272*----------------------------------------------------------------------------*/
 273
 274static inline void
 275 shift128ExtraRightJamming(
 276     uint64_t a0,
 277     uint64_t a1,
 278     uint64_t a2,
 279     int count,
 280     uint64_t *z0Ptr,
 281     uint64_t *z1Ptr,
 282     uint64_t *z2Ptr
 283 )
 284{
 285    uint64_t z0, z1, z2;
 286    int8_t negCount = ( - count ) & 63;
 287
 288    if ( count == 0 ) {
 289        z2 = a2;
 290        z1 = a1;
 291        z0 = a0;
 292    }
 293    else {
 294        if ( count < 64 ) {
 295            z2 = a1<<negCount;
 296            z1 = ( a0<<negCount ) | ( a1>>count );
 297            z0 = a0>>count;
 298        }
 299        else {
 300            if ( count == 64 ) {
 301                z2 = a1;
 302                z1 = a0;
 303            }
 304            else {
 305                a2 |= a1;
 306                if ( count < 128 ) {
 307                    z2 = a0<<negCount;
 308                    z1 = a0>>( count & 63 );
 309                }
 310                else {
 311                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
 312                    z1 = 0;
 313                }
 314            }
 315            z0 = 0;
 316        }
 317        z2 |= ( a2 != 0 );
 318    }
 319    *z2Ptr = z2;
 320    *z1Ptr = z1;
 321    *z0Ptr = z0;
 322
 323}
 324
 325/*----------------------------------------------------------------------------
 326| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 327| number of bits given in `count'.  Any bits shifted off are lost.  The value
 328| of `count' must be less than 64.  The result is broken into two 64-bit
 329| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 330*----------------------------------------------------------------------------*/
 331
 332static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
 333                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
 334{
 335    *z1Ptr = a1 << count;
 336    *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 337}
 338
 339/*----------------------------------------------------------------------------
 340| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 341| number of bits given in `count'.  Any bits shifted off are lost.  The value
 342| of `count' may be greater than 64.  The result is broken into two 64-bit
 343| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 344*----------------------------------------------------------------------------*/
 345
 346static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
 347                                uint64_t *z0Ptr, uint64_t *z1Ptr)
 348{
 349    if (count < 64) {
 350        *z1Ptr = a1 << count;
 351        *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 352    } else {
 353        *z1Ptr = 0;
 354        *z0Ptr = a1 << (count - 64);
 355    }
 356}
 357
 358/*----------------------------------------------------------------------------
 359| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
 360| by the number of bits given in `count'.  Any bits shifted off are lost.
 361| The value of `count' must be less than 64.  The result is broken into three
 362| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 363| `z1Ptr', and `z2Ptr'.
 364*----------------------------------------------------------------------------*/
 365
 366static inline void
 367 shortShift192Left(
 368     uint64_t a0,
 369     uint64_t a1,
 370     uint64_t a2,
 371     int count,
 372     uint64_t *z0Ptr,
 373     uint64_t *z1Ptr,
 374     uint64_t *z2Ptr
 375 )
 376{
 377    uint64_t z0, z1, z2;
 378    int8_t negCount;
 379
 380    z2 = a2<<count;
 381    z1 = a1<<count;
 382    z0 = a0<<count;
 383    if ( 0 < count ) {
 384        negCount = ( ( - count ) & 63 );
 385        z1 |= a2>>negCount;
 386        z0 |= a1>>negCount;
 387    }
 388    *z2Ptr = z2;
 389    *z1Ptr = z1;
 390    *z0Ptr = z0;
 391
 392}
 393
 394/*----------------------------------------------------------------------------
 395| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
 396| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
 397| any carry out is lost.  The result is broken into two 64-bit pieces which
 398| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 399*----------------------------------------------------------------------------*/
 400
 401static inline void
 402 add128(
 403     uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 404{
 405    uint64_t z1;
 406
 407    z1 = a1 + b1;
 408    *z1Ptr = z1;
 409    *z0Ptr = a0 + b0 + ( z1 < a1 );
 410
 411}
 412
 413/*----------------------------------------------------------------------------
 414| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
 415| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
 416| modulo 2^192, so any carry out is lost.  The result is broken into three
 417| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 418| `z1Ptr', and `z2Ptr'.
 419*----------------------------------------------------------------------------*/
 420
 421static inline void
 422 add192(
 423     uint64_t a0,
 424     uint64_t a1,
 425     uint64_t a2,
 426     uint64_t b0,
 427     uint64_t b1,
 428     uint64_t b2,
 429     uint64_t *z0Ptr,
 430     uint64_t *z1Ptr,
 431     uint64_t *z2Ptr
 432 )
 433{
 434    uint64_t z0, z1, z2;
 435    int8_t carry0, carry1;
 436
 437    z2 = a2 + b2;
 438    carry1 = ( z2 < a2 );
 439    z1 = a1 + b1;
 440    carry0 = ( z1 < a1 );
 441    z0 = a0 + b0;
 442    z1 += carry1;
 443    z0 += ( z1 < carry1 );
 444    z0 += carry0;
 445    *z2Ptr = z2;
 446    *z1Ptr = z1;
 447    *z0Ptr = z0;
 448
 449}
 450
 451/*----------------------------------------------------------------------------
 452| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
 453| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
 454| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
 455| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
 456| `z1Ptr'.
 457*----------------------------------------------------------------------------*/
 458
 459static inline void
 460 sub128(
 461     uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 462{
 463
 464    *z1Ptr = a1 - b1;
 465    *z0Ptr = a0 - b0 - ( a1 < b1 );
 466
 467}
 468
 469/*----------------------------------------------------------------------------
 470| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
 471| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
 472| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
 473| result is broken into three 64-bit pieces which are stored at the locations
 474| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
 475*----------------------------------------------------------------------------*/
 476
 477static inline void
 478 sub192(
 479     uint64_t a0,
 480     uint64_t a1,
 481     uint64_t a2,
 482     uint64_t b0,
 483     uint64_t b1,
 484     uint64_t b2,
 485     uint64_t *z0Ptr,
 486     uint64_t *z1Ptr,
 487     uint64_t *z2Ptr
 488 )
 489{
 490    uint64_t z0, z1, z2;
 491    int8_t borrow0, borrow1;
 492
 493    z2 = a2 - b2;
 494    borrow1 = ( a2 < b2 );
 495    z1 = a1 - b1;
 496    borrow0 = ( a1 < b1 );
 497    z0 = a0 - b0;
 498    z0 -= ( z1 < borrow1 );
 499    z1 -= borrow1;
 500    z0 -= borrow0;
 501    *z2Ptr = z2;
 502    *z1Ptr = z1;
 503    *z0Ptr = z0;
 504
 505}
 506
 507/*----------------------------------------------------------------------------
 508| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
 509| into two 64-bit pieces which are stored at the locations pointed to by
 510| `z0Ptr' and `z1Ptr'.
 511*----------------------------------------------------------------------------*/
 512
 513static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
 514{
 515    uint32_t aHigh, aLow, bHigh, bLow;
 516    uint64_t z0, zMiddleA, zMiddleB, z1;
 517
 518    aLow = a;
 519    aHigh = a>>32;
 520    bLow = b;
 521    bHigh = b>>32;
 522    z1 = ( (uint64_t) aLow ) * bLow;
 523    zMiddleA = ( (uint64_t) aLow ) * bHigh;
 524    zMiddleB = ( (uint64_t) aHigh ) * bLow;
 525    z0 = ( (uint64_t) aHigh ) * bHigh;
 526    zMiddleA += zMiddleB;
 527    z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
 528    zMiddleA <<= 32;
 529    z1 += zMiddleA;
 530    z0 += ( z1 < zMiddleA );
 531    *z1Ptr = z1;
 532    *z0Ptr = z0;
 533
 534}
 535
 536/*----------------------------------------------------------------------------
 537| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
 538| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
 539| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
 540| `z2Ptr'.
 541*----------------------------------------------------------------------------*/
 542
 543static inline void
 544 mul128By64To192(
 545     uint64_t a0,
 546     uint64_t a1,
 547     uint64_t b,
 548     uint64_t *z0Ptr,
 549     uint64_t *z1Ptr,
 550     uint64_t *z2Ptr
 551 )
 552{
 553    uint64_t z0, z1, z2, more1;
 554
 555    mul64To128( a1, b, &z1, &z2 );
 556    mul64To128( a0, b, &z0, &more1 );
 557    add128( z0, more1, 0, z1, &z0, &z1 );
 558    *z2Ptr = z2;
 559    *z1Ptr = z1;
 560    *z0Ptr = z0;
 561
 562}
 563
 564/*----------------------------------------------------------------------------
 565| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
 566| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
 567| product.  The product is broken into four 64-bit pieces which are stored at
 568| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
 569*----------------------------------------------------------------------------*/
 570
 571static inline void
 572 mul128To256(
 573     uint64_t a0,
 574     uint64_t a1,
 575     uint64_t b0,
 576     uint64_t b1,
 577     uint64_t *z0Ptr,
 578     uint64_t *z1Ptr,
 579     uint64_t *z2Ptr,
 580     uint64_t *z3Ptr
 581 )
 582{
 583    uint64_t z0, z1, z2, z3;
 584    uint64_t more1, more2;
 585
 586    mul64To128( a1, b1, &z2, &z3 );
 587    mul64To128( a1, b0, &z1, &more2 );
 588    add128( z1, more2, 0, z2, &z1, &z2 );
 589    mul64To128( a0, b0, &z0, &more1 );
 590    add128( z0, more1, 0, z1, &z0, &z1 );
 591    mul64To128( a0, b1, &more1, &more2 );
 592    add128( more1, more2, 0, z2, &more1, &z2 );
 593    add128( z0, z1, 0, more1, &z0, &z1 );
 594    *z3Ptr = z3;
 595    *z2Ptr = z2;
 596    *z1Ptr = z1;
 597    *z0Ptr = z0;
 598
 599}
 600
 601/*----------------------------------------------------------------------------
 602| Returns an approximation to the 64-bit integer quotient obtained by dividing
 603| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
 604| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
 605| toward zero, the approximation returned lies between q and q + 2 inclusive.
 606| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
 607| unsigned integer is returned.
 608*----------------------------------------------------------------------------*/
 609
 610static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 611{
 612    uint64_t b0, b1;
 613    uint64_t rem0, rem1, term0, term1;
 614    uint64_t z;
 615
 616    if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
 617    b0 = b>>32;
 618    z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
 619    mul64To128( b, z, &term0, &term1 );
 620    sub128( a0, a1, term0, term1, &rem0, &rem1 );
 621    while ( ( (int64_t) rem0 ) < 0 ) {
 622        z -= LIT64( 0x100000000 );
 623        b1 = b<<32;
 624        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
 625    }
 626    rem0 = ( rem0<<32 ) | ( rem1>>32 );
 627    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
 628    return z;
 629
 630}
 631
 632/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 633 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 634 *
 635 * Licensed under the GPLv2/LGPLv3
 636 */
 637static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 638                                  uint64_t n0, uint64_t d)
 639{
 640#if defined(__x86_64__)
 641    uint64_t q;
 642    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 643    return q;
 644#elif defined(__s390x__)
 645    /* Need to use a TImode type to get an even register pair for DLGR.  */
 646    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 647    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 648    *r = n >> 64;
 649    return n;
 650#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 651    /* From Power ISA 2.06, programming note for divdeu.  */
 652    uint64_t q1, q2, Q, r1, r2, R;
 653    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 654        : "=&r"(q1), "=r"(q2)
 655        : "r"(n1), "r"(n0), "r"(d));
 656    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 657    r2 = n0 - (q2 * d);
 658    Q = q1 + q2;
 659    R = r1 + r2;
 660    if (R >= d || R < r2) { /* overflow implies R > d */
 661        Q += 1;
 662        R -= d;
 663    }
 664    *r = R;
 665    return Q;
 666#else
 667    uint64_t d0, d1, q0, q1, r1, r0, m;
 668
 669    d0 = (uint32_t)d;
 670    d1 = d >> 32;
 671
 672    r1 = n1 % d1;
 673    q1 = n1 / d1;
 674    m = q1 * d0;
 675    r1 = (r1 << 32) | (n0 >> 32);
 676    if (r1 < m) {
 677        q1 -= 1;
 678        r1 += d;
 679        if (r1 >= d) {
 680            if (r1 < m) {
 681                q1 -= 1;
 682                r1 += d;
 683            }
 684        }
 685    }
 686    r1 -= m;
 687
 688    r0 = r1 % d1;
 689    q0 = r1 / d1;
 690    m = q0 * d0;
 691    r0 = (r0 << 32) | (uint32_t)n0;
 692    if (r0 < m) {
 693        q0 -= 1;
 694        r0 += d;
 695        if (r0 >= d) {
 696            if (r0 < m) {
 697                q0 -= 1;
 698                r0 += d;
 699            }
 700        }
 701    }
 702    r0 -= m;
 703
 704    *r = r0;
 705    return (q1 << 32) | q0;
 706#endif
 707}
 708
 709/*----------------------------------------------------------------------------
 710| Returns an approximation to the square root of the 32-bit significand given
 711| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 712| `aExp' (the least significant bit) is 1, the integer returned approximates
 713| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
 714| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
 715| case, the approximation returned lies strictly within +/-2 of the exact
 716| value.
 717*----------------------------------------------------------------------------*/
 718
 719static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
 720{
 721    static const uint16_t sqrtOddAdjustments[] = {
 722        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
 723        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
 724    };
 725    static const uint16_t sqrtEvenAdjustments[] = {
 726        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
 727        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
 728    };
 729    int8_t index;
 730    uint32_t z;
 731
 732    index = ( a>>27 ) & 15;
 733    if ( aExp & 1 ) {
 734        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
 735        z = ( ( a / z )<<14 ) + ( z<<15 );
 736        a >>= 1;
 737    }
 738    else {
 739        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
 740        z = a / z + z;
 741        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
 742        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
 743    }
 744    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
 745
 746}
 747
 748/*----------------------------------------------------------------------------
 749| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
 750| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
 751| Otherwise, returns 0.
 752*----------------------------------------------------------------------------*/
 753
 754static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 755{
 756
 757    return ( a0 == b0 ) && ( a1 == b1 );
 758
 759}
 760
 761/*----------------------------------------------------------------------------
 762| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 763| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
 764| Otherwise, returns 0.
 765*----------------------------------------------------------------------------*/
 766
 767static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 768{
 769
 770    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
 771
 772}
 773
 774/*----------------------------------------------------------------------------
 775| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 776| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
 777| returns 0.
 778*----------------------------------------------------------------------------*/
 779
 780static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 781{
 782
 783    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
 784
 785}
 786
 787/*----------------------------------------------------------------------------
 788| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
 789| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
 790| Otherwise, returns 0.
 791*----------------------------------------------------------------------------*/
 792
 793static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 794{
 795
 796    return ( a0 != b0 ) || ( a1 != b1 );
 797
 798}
 799