qemu/include/fpu/softfloat-macros.h
<<
>>
Prefs
   1/*
   2 * QEMU float support macros
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82#ifndef FPU_SOFTFLOAT_MACROS_H
  83#define FPU_SOFTFLOAT_MACROS_H
  84
  85/*----------------------------------------------------------------------------
  86| Shifts `a' right by the number of bits given in `count'.  If any nonzero
  87| bits are shifted off, they are ``jammed'' into the least significant bit of
  88| the result by setting the least significant bit to 1.  The value of `count'
  89| can be arbitrarily large; in particular, if `count' is greater than 32, the
  90| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  91| The result is stored in the location pointed to by `zPtr'.
  92*----------------------------------------------------------------------------*/
  93
  94static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
  95{
  96    uint32_t z;
  97
  98    if ( count == 0 ) {
  99        z = a;
 100    }
 101    else if ( count < 32 ) {
 102        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
 103    }
 104    else {
 105        z = ( a != 0 );
 106    }
 107    *zPtr = z;
 108
 109}
 110
 111/*----------------------------------------------------------------------------
 112| Shifts `a' right by the number of bits given in `count'.  If any nonzero
 113| bits are shifted off, they are ``jammed'' into the least significant bit of
 114| the result by setting the least significant bit to 1.  The value of `count'
 115| can be arbitrarily large; in particular, if `count' is greater than 64, the
 116| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 117| The result is stored in the location pointed to by `zPtr'.
 118*----------------------------------------------------------------------------*/
 119
 120static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
 121{
 122    uint64_t z;
 123
 124    if ( count == 0 ) {
 125        z = a;
 126    }
 127    else if ( count < 64 ) {
 128        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
 129    }
 130    else {
 131        z = ( a != 0 );
 132    }
 133    *zPtr = z;
 134
 135}
 136
 137/*----------------------------------------------------------------------------
 138| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
 139| _plus_ the number of bits given in `count'.  The shifted result is at most
 140| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
 141| bits shifted off form a second 64-bit result as follows:  The _last_ bit
 142| shifted off is the most-significant bit of the extra result, and the other
 143| 63 bits of the extra result are all zero if and only if _all_but_the_last_
 144| bits shifted off were all zero.  This extra result is stored in the location
 145| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
 146|     (This routine makes more sense if `a0' and `a1' are considered to form a
 147| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
 148| value is shifted right by the number of bits given in `count', and the
 149| integer part of the result is returned at the location pointed to by
 150| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 151| described above, and is returned at the location pointed to by `z1Ptr'.)
 152*----------------------------------------------------------------------------*/
 153
 154static inline void
 155 shift64ExtraRightJamming(
 156     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 157{
 158    uint64_t z0, z1;
 159    int8_t negCount = ( - count ) & 63;
 160
 161    if ( count == 0 ) {
 162        z1 = a1;
 163        z0 = a0;
 164    }
 165    else if ( count < 64 ) {
 166        z1 = ( a0<<negCount ) | ( a1 != 0 );
 167        z0 = a0>>count;
 168    }
 169    else {
 170        if ( count == 64 ) {
 171            z1 = a0 | ( a1 != 0 );
 172        }
 173        else {
 174            z1 = ( ( a0 | a1 ) != 0 );
 175        }
 176        z0 = 0;
 177    }
 178    *z1Ptr = z1;
 179    *z0Ptr = z0;
 180
 181}
 182
 183/*----------------------------------------------------------------------------
 184| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 185| number of bits given in `count'.  Any bits shifted off are lost.  The value
 186| of `count' can be arbitrarily large; in particular, if `count' is greater
 187| than 128, the result will be 0.  The result is broken into two 64-bit pieces
 188| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 189*----------------------------------------------------------------------------*/
 190
 191static inline void
 192 shift128Right(
 193     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 194{
 195    uint64_t z0, z1;
 196    int8_t negCount = ( - count ) & 63;
 197
 198    if ( count == 0 ) {
 199        z1 = a1;
 200        z0 = a0;
 201    }
 202    else if ( count < 64 ) {
 203        z1 = ( a0<<negCount ) | ( a1>>count );
 204        z0 = a0>>count;
 205    }
 206    else {
 207        z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
 208        z0 = 0;
 209    }
 210    *z1Ptr = z1;
 211    *z0Ptr = z0;
 212
 213}
 214
 215/*----------------------------------------------------------------------------
 216| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 217| number of bits given in `count'.  If any nonzero bits are shifted off, they
 218| are ``jammed'' into the least significant bit of the result by setting the
 219| least significant bit to 1.  The value of `count' can be arbitrarily large;
 220| in particular, if `count' is greater than 128, the result will be either
 221| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
 222| nonzero.  The result is broken into two 64-bit pieces which are stored at
 223| the locations pointed to by `z0Ptr' and `z1Ptr'.
 224*----------------------------------------------------------------------------*/
 225
 226static inline void
 227 shift128RightJamming(
 228     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 229{
 230    uint64_t z0, z1;
 231    int8_t negCount = ( - count ) & 63;
 232
 233    if ( count == 0 ) {
 234        z1 = a1;
 235        z0 = a0;
 236    }
 237    else if ( count < 64 ) {
 238        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
 239        z0 = a0>>count;
 240    }
 241    else {
 242        if ( count == 64 ) {
 243            z1 = a0 | ( a1 != 0 );
 244        }
 245        else if ( count < 128 ) {
 246            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
 247        }
 248        else {
 249            z1 = ( ( a0 | a1 ) != 0 );
 250        }
 251        z0 = 0;
 252    }
 253    *z1Ptr = z1;
 254    *z0Ptr = z0;
 255
 256}
 257
 258/*----------------------------------------------------------------------------
 259| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
 260| by 64 _plus_ the number of bits given in `count'.  The shifted result is
 261| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
 262| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
 263| off form a third 64-bit result as follows:  The _last_ bit shifted off is
 264| the most-significant bit of the extra result, and the other 63 bits of the
 265| extra result are all zero if and only if _all_but_the_last_ bits shifted off
 266| were all zero.  This extra result is stored in the location pointed to by
 267| `z2Ptr'.  The value of `count' can be arbitrarily large.
 268|     (This routine makes more sense if `a0', `a1', and `a2' are considered
 269| to form a fixed-point value with binary point between `a1' and `a2'.  This
 270| fixed-point value is shifted right by the number of bits given in `count',
 271| and the integer part of the result is returned at the locations pointed to
 272| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
 273| corrupted as described above, and is returned at the location pointed to by
 274| `z2Ptr'.)
 275*----------------------------------------------------------------------------*/
 276
 277static inline void
 278 shift128ExtraRightJamming(
 279     uint64_t a0,
 280     uint64_t a1,
 281     uint64_t a2,
 282     int count,
 283     uint64_t *z0Ptr,
 284     uint64_t *z1Ptr,
 285     uint64_t *z2Ptr
 286 )
 287{
 288    uint64_t z0, z1, z2;
 289    int8_t negCount = ( - count ) & 63;
 290
 291    if ( count == 0 ) {
 292        z2 = a2;
 293        z1 = a1;
 294        z0 = a0;
 295    }
 296    else {
 297        if ( count < 64 ) {
 298            z2 = a1<<negCount;
 299            z1 = ( a0<<negCount ) | ( a1>>count );
 300            z0 = a0>>count;
 301        }
 302        else {
 303            if ( count == 64 ) {
 304                z2 = a1;
 305                z1 = a0;
 306            }
 307            else {
 308                a2 |= a1;
 309                if ( count < 128 ) {
 310                    z2 = a0<<negCount;
 311                    z1 = a0>>( count & 63 );
 312                }
 313                else {
 314                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
 315                    z1 = 0;
 316                }
 317            }
 318            z0 = 0;
 319        }
 320        z2 |= ( a2 != 0 );
 321    }
 322    *z2Ptr = z2;
 323    *z1Ptr = z1;
 324    *z0Ptr = z0;
 325
 326}
 327
 328/*----------------------------------------------------------------------------
 329| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 330| number of bits given in `count'.  Any bits shifted off are lost.  The value
 331| of `count' must be less than 64.  The result is broken into two 64-bit
 332| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 333*----------------------------------------------------------------------------*/
 334
 335static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
 336                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
 337{
 338    *z1Ptr = a1 << count;
 339    *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 340}
 341
 342/*----------------------------------------------------------------------------
 343| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 344| number of bits given in `count'.  Any bits shifted off are lost.  The value
 345| of `count' may be greater than 64.  The result is broken into two 64-bit
 346| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 347*----------------------------------------------------------------------------*/
 348
 349static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
 350                                uint64_t *z0Ptr, uint64_t *z1Ptr)
 351{
 352    if (count < 64) {
 353        *z1Ptr = a1 << count;
 354        *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 355    } else {
 356        *z1Ptr = 0;
 357        *z0Ptr = a1 << (count - 64);
 358    }
 359}
 360
 361/*----------------------------------------------------------------------------
 362| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
 363| by the number of bits given in `count'.  Any bits shifted off are lost.
 364| The value of `count' must be less than 64.  The result is broken into three
 365| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 366| `z1Ptr', and `z2Ptr'.
 367*----------------------------------------------------------------------------*/
 368
 369static inline void
 370 shortShift192Left(
 371     uint64_t a0,
 372     uint64_t a1,
 373     uint64_t a2,
 374     int count,
 375     uint64_t *z0Ptr,
 376     uint64_t *z1Ptr,
 377     uint64_t *z2Ptr
 378 )
 379{
 380    uint64_t z0, z1, z2;
 381    int8_t negCount;
 382
 383    z2 = a2<<count;
 384    z1 = a1<<count;
 385    z0 = a0<<count;
 386    if ( 0 < count ) {
 387        negCount = ( ( - count ) & 63 );
 388        z1 |= a2>>negCount;
 389        z0 |= a1>>negCount;
 390    }
 391    *z2Ptr = z2;
 392    *z1Ptr = z1;
 393    *z0Ptr = z0;
 394
 395}
 396
 397/*----------------------------------------------------------------------------
 398| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
 399| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
 400| any carry out is lost.  The result is broken into two 64-bit pieces which
 401| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 402*----------------------------------------------------------------------------*/
 403
 404static inline void
 405 add128(
 406     uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 407{
 408    uint64_t z1;
 409
 410    z1 = a1 + b1;
 411    *z1Ptr = z1;
 412    *z0Ptr = a0 + b0 + ( z1 < a1 );
 413
 414}
 415
 416/*----------------------------------------------------------------------------
 417| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
 418| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
 419| modulo 2^192, so any carry out is lost.  The result is broken into three
 420| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 421| `z1Ptr', and `z2Ptr'.
 422*----------------------------------------------------------------------------*/
 423
 424static inline void
 425 add192(
 426     uint64_t a0,
 427     uint64_t a1,
 428     uint64_t a2,
 429     uint64_t b0,
 430     uint64_t b1,
 431     uint64_t b2,
 432     uint64_t *z0Ptr,
 433     uint64_t *z1Ptr,
 434     uint64_t *z2Ptr
 435 )
 436{
 437    uint64_t z0, z1, z2;
 438    int8_t carry0, carry1;
 439
 440    z2 = a2 + b2;
 441    carry1 = ( z2 < a2 );
 442    z1 = a1 + b1;
 443    carry0 = ( z1 < a1 );
 444    z0 = a0 + b0;
 445    z1 += carry1;
 446    z0 += ( z1 < carry1 );
 447    z0 += carry0;
 448    *z2Ptr = z2;
 449    *z1Ptr = z1;
 450    *z0Ptr = z0;
 451
 452}
 453
 454/*----------------------------------------------------------------------------
 455| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
 456| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
 457| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
 458| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
 459| `z1Ptr'.
 460*----------------------------------------------------------------------------*/
 461
 462static inline void
 463 sub128(
 464     uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 465{
 466
 467    *z1Ptr = a1 - b1;
 468    *z0Ptr = a0 - b0 - ( a1 < b1 );
 469
 470}
 471
 472/*----------------------------------------------------------------------------
 473| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
 474| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
 475| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
 476| result is broken into three 64-bit pieces which are stored at the locations
 477| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
 478*----------------------------------------------------------------------------*/
 479
 480static inline void
 481 sub192(
 482     uint64_t a0,
 483     uint64_t a1,
 484     uint64_t a2,
 485     uint64_t b0,
 486     uint64_t b1,
 487     uint64_t b2,
 488     uint64_t *z0Ptr,
 489     uint64_t *z1Ptr,
 490     uint64_t *z2Ptr
 491 )
 492{
 493    uint64_t z0, z1, z2;
 494    int8_t borrow0, borrow1;
 495
 496    z2 = a2 - b2;
 497    borrow1 = ( a2 < b2 );
 498    z1 = a1 - b1;
 499    borrow0 = ( a1 < b1 );
 500    z0 = a0 - b0;
 501    z0 -= ( z1 < borrow1 );
 502    z1 -= borrow1;
 503    z0 -= borrow0;
 504    *z2Ptr = z2;
 505    *z1Ptr = z1;
 506    *z0Ptr = z0;
 507
 508}
 509
 510/*----------------------------------------------------------------------------
 511| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
 512| into two 64-bit pieces which are stored at the locations pointed to by
 513| `z0Ptr' and `z1Ptr'.
 514*----------------------------------------------------------------------------*/
 515
 516static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
 517{
 518    uint32_t aHigh, aLow, bHigh, bLow;
 519    uint64_t z0, zMiddleA, zMiddleB, z1;
 520
 521    aLow = a;
 522    aHigh = a>>32;
 523    bLow = b;
 524    bHigh = b>>32;
 525    z1 = ( (uint64_t) aLow ) * bLow;
 526    zMiddleA = ( (uint64_t) aLow ) * bHigh;
 527    zMiddleB = ( (uint64_t) aHigh ) * bLow;
 528    z0 = ( (uint64_t) aHigh ) * bHigh;
 529    zMiddleA += zMiddleB;
 530    z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
 531    zMiddleA <<= 32;
 532    z1 += zMiddleA;
 533    z0 += ( z1 < zMiddleA );
 534    *z1Ptr = z1;
 535    *z0Ptr = z0;
 536
 537}
 538
 539/*----------------------------------------------------------------------------
 540| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
 541| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
 542| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
 543| `z2Ptr'.
 544*----------------------------------------------------------------------------*/
 545
 546static inline void
 547 mul128By64To192(
 548     uint64_t a0,
 549     uint64_t a1,
 550     uint64_t b,
 551     uint64_t *z0Ptr,
 552     uint64_t *z1Ptr,
 553     uint64_t *z2Ptr
 554 )
 555{
 556    uint64_t z0, z1, z2, more1;
 557
 558    mul64To128( a1, b, &z1, &z2 );
 559    mul64To128( a0, b, &z0, &more1 );
 560    add128( z0, more1, 0, z1, &z0, &z1 );
 561    *z2Ptr = z2;
 562    *z1Ptr = z1;
 563    *z0Ptr = z0;
 564
 565}
 566
 567/*----------------------------------------------------------------------------
 568| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
 569| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
 570| product.  The product is broken into four 64-bit pieces which are stored at
 571| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
 572*----------------------------------------------------------------------------*/
 573
 574static inline void
 575 mul128To256(
 576     uint64_t a0,
 577     uint64_t a1,
 578     uint64_t b0,
 579     uint64_t b1,
 580     uint64_t *z0Ptr,
 581     uint64_t *z1Ptr,
 582     uint64_t *z2Ptr,
 583     uint64_t *z3Ptr
 584 )
 585{
 586    uint64_t z0, z1, z2, z3;
 587    uint64_t more1, more2;
 588
 589    mul64To128( a1, b1, &z2, &z3 );
 590    mul64To128( a1, b0, &z1, &more2 );
 591    add128( z1, more2, 0, z2, &z1, &z2 );
 592    mul64To128( a0, b0, &z0, &more1 );
 593    add128( z0, more1, 0, z1, &z0, &z1 );
 594    mul64To128( a0, b1, &more1, &more2 );
 595    add128( more1, more2, 0, z2, &more1, &z2 );
 596    add128( z0, z1, 0, more1, &z0, &z1 );
 597    *z3Ptr = z3;
 598    *z2Ptr = z2;
 599    *z1Ptr = z1;
 600    *z0Ptr = z0;
 601
 602}
 603
 604/*----------------------------------------------------------------------------
 605| Returns an approximation to the 64-bit integer quotient obtained by dividing
 606| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
 607| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
 608| toward zero, the approximation returned lies between q and q + 2 inclusive.
 609| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
 610| unsigned integer is returned.
 611*----------------------------------------------------------------------------*/
 612
 613static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 614{
 615    uint64_t b0, b1;
 616    uint64_t rem0, rem1, term0, term1;
 617    uint64_t z;
 618
 619    if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
 620    b0 = b>>32;
 621    z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
 622    mul64To128( b, z, &term0, &term1 );
 623    sub128( a0, a1, term0, term1, &rem0, &rem1 );
 624    while ( ( (int64_t) rem0 ) < 0 ) {
 625        z -= LIT64( 0x100000000 );
 626        b1 = b<<32;
 627        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
 628    }
 629    rem0 = ( rem0<<32 ) | ( rem1>>32 );
 630    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
 631    return z;
 632
 633}
 634
 635/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 636 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 637 *
 638 * Licensed under the GPLv2/LGPLv3
 639 */
 640static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 641                                  uint64_t n0, uint64_t d)
 642{
 643#if defined(__x86_64__)
 644    uint64_t q;
 645    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 646    return q;
 647#elif defined(__s390x__) && !defined(__clang__)
 648    /* Need to use a TImode type to get an even register pair for DLGR.  */
 649    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 650    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 651    *r = n >> 64;
 652    return n;
 653#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 654    /* From Power ISA 2.06, programming note for divdeu.  */
 655    uint64_t q1, q2, Q, r1, r2, R;
 656    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 657        : "=&r"(q1), "=r"(q2)
 658        : "r"(n1), "r"(n0), "r"(d));
 659    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 660    r2 = n0 - (q2 * d);
 661    Q = q1 + q2;
 662    R = r1 + r2;
 663    if (R >= d || R < r2) { /* overflow implies R > d */
 664        Q += 1;
 665        R -= d;
 666    }
 667    *r = R;
 668    return Q;
 669#else
 670    uint64_t d0, d1, q0, q1, r1, r0, m;
 671
 672    d0 = (uint32_t)d;
 673    d1 = d >> 32;
 674
 675    r1 = n1 % d1;
 676    q1 = n1 / d1;
 677    m = q1 * d0;
 678    r1 = (r1 << 32) | (n0 >> 32);
 679    if (r1 < m) {
 680        q1 -= 1;
 681        r1 += d;
 682        if (r1 >= d) {
 683            if (r1 < m) {
 684                q1 -= 1;
 685                r1 += d;
 686            }
 687        }
 688    }
 689    r1 -= m;
 690
 691    r0 = r1 % d1;
 692    q0 = r1 / d1;
 693    m = q0 * d0;
 694    r0 = (r0 << 32) | (uint32_t)n0;
 695    if (r0 < m) {
 696        q0 -= 1;
 697        r0 += d;
 698        if (r0 >= d) {
 699            if (r0 < m) {
 700                q0 -= 1;
 701                r0 += d;
 702            }
 703        }
 704    }
 705    r0 -= m;
 706
 707    *r = r0;
 708    return (q1 << 32) | q0;
 709#endif
 710}
 711
 712/*----------------------------------------------------------------------------
 713| Returns an approximation to the square root of the 32-bit significand given
 714| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 715| `aExp' (the least significant bit) is 1, the integer returned approximates
 716| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
 717| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
 718| case, the approximation returned lies strictly within +/-2 of the exact
 719| value.
 720*----------------------------------------------------------------------------*/
 721
 722static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
 723{
 724    static const uint16_t sqrtOddAdjustments[] = {
 725        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
 726        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
 727    };
 728    static const uint16_t sqrtEvenAdjustments[] = {
 729        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
 730        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
 731    };
 732    int8_t index;
 733    uint32_t z;
 734
 735    index = ( a>>27 ) & 15;
 736    if ( aExp & 1 ) {
 737        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
 738        z = ( ( a / z )<<14 ) + ( z<<15 );
 739        a >>= 1;
 740    }
 741    else {
 742        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
 743        z = a / z + z;
 744        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
 745        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
 746    }
 747    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
 748
 749}
 750
 751/*----------------------------------------------------------------------------
 752| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
 753| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
 754| Otherwise, returns 0.
 755*----------------------------------------------------------------------------*/
 756
 757static inline flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 758{
 759
 760    return ( a0 == b0 ) && ( a1 == b1 );
 761
 762}
 763
 764/*----------------------------------------------------------------------------
 765| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 766| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
 767| Otherwise, returns 0.
 768*----------------------------------------------------------------------------*/
 769
 770static inline flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 771{
 772
 773    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
 774
 775}
 776
 777/*----------------------------------------------------------------------------
 778| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 779| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
 780| returns 0.
 781*----------------------------------------------------------------------------*/
 782
 783static inline flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 784{
 785
 786    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
 787
 788}
 789
 790/*----------------------------------------------------------------------------
 791| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
 792| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
 793| Otherwise, returns 0.
 794*----------------------------------------------------------------------------*/
 795
 796static inline flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 797{
 798
 799    return ( a0 != b0 ) || ( a1 != b1 );
 800
 801}
 802
 803#endif
 804