qemu/include/fpu/softfloat-macros.h
<<
>>
Prefs
   1/*
   2 * QEMU float support macros
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82#ifndef FPU_SOFTFLOAT_MACROS_H
  83#define FPU_SOFTFLOAT_MACROS_H
  84
  85#include "fpu/softfloat-types.h"
  86
  87/*----------------------------------------------------------------------------
  88| Shifts `a' right by the number of bits given in `count'.  If any nonzero
  89| bits are shifted off, they are ``jammed'' into the least significant bit of
  90| the result by setting the least significant bit to 1.  The value of `count'
  91| can be arbitrarily large; in particular, if `count' is greater than 32, the
  92| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  93| The result is stored in the location pointed to by `zPtr'.
  94*----------------------------------------------------------------------------*/
  95
  96static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
  97{
  98    uint32_t z;
  99
 100    if ( count == 0 ) {
 101        z = a;
 102    }
 103    else if ( count < 32 ) {
 104        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
 105    }
 106    else {
 107        z = ( a != 0 );
 108    }
 109    *zPtr = z;
 110
 111}
 112
 113/*----------------------------------------------------------------------------
 114| Shifts `a' right by the number of bits given in `count'.  If any nonzero
 115| bits are shifted off, they are ``jammed'' into the least significant bit of
 116| the result by setting the least significant bit to 1.  The value of `count'
 117| can be arbitrarily large; in particular, if `count' is greater than 64, the
 118| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 119| The result is stored in the location pointed to by `zPtr'.
 120*----------------------------------------------------------------------------*/
 121
 122static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
 123{
 124    uint64_t z;
 125
 126    if ( count == 0 ) {
 127        z = a;
 128    }
 129    else if ( count < 64 ) {
 130        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
 131    }
 132    else {
 133        z = ( a != 0 );
 134    }
 135    *zPtr = z;
 136
 137}
 138
 139/*----------------------------------------------------------------------------
 140| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
 141| _plus_ the number of bits given in `count'.  The shifted result is at most
 142| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
 143| bits shifted off form a second 64-bit result as follows:  The _last_ bit
 144| shifted off is the most-significant bit of the extra result, and the other
 145| 63 bits of the extra result are all zero if and only if _all_but_the_last_
 146| bits shifted off were all zero.  This extra result is stored in the location
 147| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
 148|     (This routine makes more sense if `a0' and `a1' are considered to form a
 149| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
 150| value is shifted right by the number of bits given in `count', and the
 151| integer part of the result is returned at the location pointed to by
 152| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 153| described above, and is returned at the location pointed to by `z1Ptr'.)
 154*----------------------------------------------------------------------------*/
 155
 156static inline void
 157 shift64ExtraRightJamming(
 158     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 159{
 160    uint64_t z0, z1;
 161    int8_t negCount = ( - count ) & 63;
 162
 163    if ( count == 0 ) {
 164        z1 = a1;
 165        z0 = a0;
 166    }
 167    else if ( count < 64 ) {
 168        z1 = ( a0<<negCount ) | ( a1 != 0 );
 169        z0 = a0>>count;
 170    }
 171    else {
 172        if ( count == 64 ) {
 173            z1 = a0 | ( a1 != 0 );
 174        }
 175        else {
 176            z1 = ( ( a0 | a1 ) != 0 );
 177        }
 178        z0 = 0;
 179    }
 180    *z1Ptr = z1;
 181    *z0Ptr = z0;
 182
 183}
 184
 185/*----------------------------------------------------------------------------
 186| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 187| number of bits given in `count'.  Any bits shifted off are lost.  The value
 188| of `count' can be arbitrarily large; in particular, if `count' is greater
 189| than 128, the result will be 0.  The result is broken into two 64-bit pieces
 190| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 191*----------------------------------------------------------------------------*/
 192
 193static inline void
 194 shift128Right(
 195     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 196{
 197    uint64_t z0, z1;
 198    int8_t negCount = ( - count ) & 63;
 199
 200    if ( count == 0 ) {
 201        z1 = a1;
 202        z0 = a0;
 203    }
 204    else if ( count < 64 ) {
 205        z1 = ( a0<<negCount ) | ( a1>>count );
 206        z0 = a0>>count;
 207    }
 208    else {
 209        z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
 210        z0 = 0;
 211    }
 212    *z1Ptr = z1;
 213    *z0Ptr = z0;
 214
 215}
 216
 217/*----------------------------------------------------------------------------
 218| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 219| number of bits given in `count'.  If any nonzero bits are shifted off, they
 220| are ``jammed'' into the least significant bit of the result by setting the
 221| least significant bit to 1.  The value of `count' can be arbitrarily large;
 222| in particular, if `count' is greater than 128, the result will be either
 223| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
 224| nonzero.  The result is broken into two 64-bit pieces which are stored at
 225| the locations pointed to by `z0Ptr' and `z1Ptr'.
 226*----------------------------------------------------------------------------*/
 227
 228static inline void
 229 shift128RightJamming(
 230     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 231{
 232    uint64_t z0, z1;
 233    int8_t negCount = ( - count ) & 63;
 234
 235    if ( count == 0 ) {
 236        z1 = a1;
 237        z0 = a0;
 238    }
 239    else if ( count < 64 ) {
 240        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
 241        z0 = a0>>count;
 242    }
 243    else {
 244        if ( count == 64 ) {
 245            z1 = a0 | ( a1 != 0 );
 246        }
 247        else if ( count < 128 ) {
 248            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
 249        }
 250        else {
 251            z1 = ( ( a0 | a1 ) != 0 );
 252        }
 253        z0 = 0;
 254    }
 255    *z1Ptr = z1;
 256    *z0Ptr = z0;
 257
 258}
 259
 260/*----------------------------------------------------------------------------
 261| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
 262| by 64 _plus_ the number of bits given in `count'.  The shifted result is
 263| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
 264| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
 265| off form a third 64-bit result as follows:  The _last_ bit shifted off is
 266| the most-significant bit of the extra result, and the other 63 bits of the
 267| extra result are all zero if and only if _all_but_the_last_ bits shifted off
 268| were all zero.  This extra result is stored in the location pointed to by
 269| `z2Ptr'.  The value of `count' can be arbitrarily large.
 270|     (This routine makes more sense if `a0', `a1', and `a2' are considered
 271| to form a fixed-point value with binary point between `a1' and `a2'.  This
 272| fixed-point value is shifted right by the number of bits given in `count',
 273| and the integer part of the result is returned at the locations pointed to
 274| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
 275| corrupted as described above, and is returned at the location pointed to by
 276| `z2Ptr'.)
 277*----------------------------------------------------------------------------*/
 278
 279static inline void
 280 shift128ExtraRightJamming(
 281     uint64_t a0,
 282     uint64_t a1,
 283     uint64_t a2,
 284     int count,
 285     uint64_t *z0Ptr,
 286     uint64_t *z1Ptr,
 287     uint64_t *z2Ptr
 288 )
 289{
 290    uint64_t z0, z1, z2;
 291    int8_t negCount = ( - count ) & 63;
 292
 293    if ( count == 0 ) {
 294        z2 = a2;
 295        z1 = a1;
 296        z0 = a0;
 297    }
 298    else {
 299        if ( count < 64 ) {
 300            z2 = a1<<negCount;
 301            z1 = ( a0<<negCount ) | ( a1>>count );
 302            z0 = a0>>count;
 303        }
 304        else {
 305            if ( count == 64 ) {
 306                z2 = a1;
 307                z1 = a0;
 308            }
 309            else {
 310                a2 |= a1;
 311                if ( count < 128 ) {
 312                    z2 = a0<<negCount;
 313                    z1 = a0>>( count & 63 );
 314                }
 315                else {
 316                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
 317                    z1 = 0;
 318                }
 319            }
 320            z0 = 0;
 321        }
 322        z2 |= ( a2 != 0 );
 323    }
 324    *z2Ptr = z2;
 325    *z1Ptr = z1;
 326    *z0Ptr = z0;
 327
 328}
 329
 330/*----------------------------------------------------------------------------
 331| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 332| number of bits given in `count'.  Any bits shifted off are lost.  The value
 333| of `count' must be less than 64.  The result is broken into two 64-bit
 334| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 335*----------------------------------------------------------------------------*/
 336
 337static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
 338                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
 339{
 340    *z1Ptr = a1 << count;
 341    *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 342}
 343
 344/*----------------------------------------------------------------------------
 345| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 346| number of bits given in `count'.  Any bits shifted off are lost.  The value
 347| of `count' may be greater than 64.  The result is broken into two 64-bit
 348| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 349*----------------------------------------------------------------------------*/
 350
 351static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
 352                                uint64_t *z0Ptr, uint64_t *z1Ptr)
 353{
 354    if (count < 64) {
 355        *z1Ptr = a1 << count;
 356        *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 357    } else {
 358        *z1Ptr = 0;
 359        *z0Ptr = a1 << (count - 64);
 360    }
 361}
 362
 363/*----------------------------------------------------------------------------
 364| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
 365| by the number of bits given in `count'.  Any bits shifted off are lost.
 366| The value of `count' must be less than 64.  The result is broken into three
 367| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 368| `z1Ptr', and `z2Ptr'.
 369*----------------------------------------------------------------------------*/
 370
 371static inline void
 372 shortShift192Left(
 373     uint64_t a0,
 374     uint64_t a1,
 375     uint64_t a2,
 376     int count,
 377     uint64_t *z0Ptr,
 378     uint64_t *z1Ptr,
 379     uint64_t *z2Ptr
 380 )
 381{
 382    uint64_t z0, z1, z2;
 383    int8_t negCount;
 384
 385    z2 = a2<<count;
 386    z1 = a1<<count;
 387    z0 = a0<<count;
 388    if ( 0 < count ) {
 389        negCount = ( ( - count ) & 63 );
 390        z1 |= a2>>negCount;
 391        z0 |= a1>>negCount;
 392    }
 393    *z2Ptr = z2;
 394    *z1Ptr = z1;
 395    *z0Ptr = z0;
 396
 397}
 398
 399/*----------------------------------------------------------------------------
 400| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
 401| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
 402| any carry out is lost.  The result is broken into two 64-bit pieces which
 403| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 404*----------------------------------------------------------------------------*/
 405
 406static inline void
 407 add128(
 408     uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 409{
 410    uint64_t z1;
 411
 412    z1 = a1 + b1;
 413    *z1Ptr = z1;
 414    *z0Ptr = a0 + b0 + ( z1 < a1 );
 415
 416}
 417
 418/*----------------------------------------------------------------------------
 419| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
 420| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
 421| modulo 2^192, so any carry out is lost.  The result is broken into three
 422| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 423| `z1Ptr', and `z2Ptr'.
 424*----------------------------------------------------------------------------*/
 425
 426static inline void
 427 add192(
 428     uint64_t a0,
 429     uint64_t a1,
 430     uint64_t a2,
 431     uint64_t b0,
 432     uint64_t b1,
 433     uint64_t b2,
 434     uint64_t *z0Ptr,
 435     uint64_t *z1Ptr,
 436     uint64_t *z2Ptr
 437 )
 438{
 439    uint64_t z0, z1, z2;
 440    int8_t carry0, carry1;
 441
 442    z2 = a2 + b2;
 443    carry1 = ( z2 < a2 );
 444    z1 = a1 + b1;
 445    carry0 = ( z1 < a1 );
 446    z0 = a0 + b0;
 447    z1 += carry1;
 448    z0 += ( z1 < carry1 );
 449    z0 += carry0;
 450    *z2Ptr = z2;
 451    *z1Ptr = z1;
 452    *z0Ptr = z0;
 453
 454}
 455
 456/*----------------------------------------------------------------------------
 457| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
 458| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
 459| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
 460| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
 461| `z1Ptr'.
 462*----------------------------------------------------------------------------*/
 463
 464static inline void
 465 sub128(
 466     uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 467{
 468
 469    *z1Ptr = a1 - b1;
 470    *z0Ptr = a0 - b0 - ( a1 < b1 );
 471
 472}
 473
 474/*----------------------------------------------------------------------------
 475| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
 476| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
 477| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
 478| result is broken into three 64-bit pieces which are stored at the locations
 479| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
 480*----------------------------------------------------------------------------*/
 481
 482static inline void
 483 sub192(
 484     uint64_t a0,
 485     uint64_t a1,
 486     uint64_t a2,
 487     uint64_t b0,
 488     uint64_t b1,
 489     uint64_t b2,
 490     uint64_t *z0Ptr,
 491     uint64_t *z1Ptr,
 492     uint64_t *z2Ptr
 493 )
 494{
 495    uint64_t z0, z1, z2;
 496    int8_t borrow0, borrow1;
 497
 498    z2 = a2 - b2;
 499    borrow1 = ( a2 < b2 );
 500    z1 = a1 - b1;
 501    borrow0 = ( a1 < b1 );
 502    z0 = a0 - b0;
 503    z0 -= ( z1 < borrow1 );
 504    z1 -= borrow1;
 505    z0 -= borrow0;
 506    *z2Ptr = z2;
 507    *z1Ptr = z1;
 508    *z0Ptr = z0;
 509
 510}
 511
 512/*----------------------------------------------------------------------------
 513| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
 514| into two 64-bit pieces which are stored at the locations pointed to by
 515| `z0Ptr' and `z1Ptr'.
 516*----------------------------------------------------------------------------*/
 517
 518static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
 519{
 520    uint32_t aHigh, aLow, bHigh, bLow;
 521    uint64_t z0, zMiddleA, zMiddleB, z1;
 522
 523    aLow = a;
 524    aHigh = a>>32;
 525    bLow = b;
 526    bHigh = b>>32;
 527    z1 = ( (uint64_t) aLow ) * bLow;
 528    zMiddleA = ( (uint64_t) aLow ) * bHigh;
 529    zMiddleB = ( (uint64_t) aHigh ) * bLow;
 530    z0 = ( (uint64_t) aHigh ) * bHigh;
 531    zMiddleA += zMiddleB;
 532    z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
 533    zMiddleA <<= 32;
 534    z1 += zMiddleA;
 535    z0 += ( z1 < zMiddleA );
 536    *z1Ptr = z1;
 537    *z0Ptr = z0;
 538
 539}
 540
 541/*----------------------------------------------------------------------------
 542| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
 543| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
 544| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
 545| `z2Ptr'.
 546*----------------------------------------------------------------------------*/
 547
 548static inline void
 549 mul128By64To192(
 550     uint64_t a0,
 551     uint64_t a1,
 552     uint64_t b,
 553     uint64_t *z0Ptr,
 554     uint64_t *z1Ptr,
 555     uint64_t *z2Ptr
 556 )
 557{
 558    uint64_t z0, z1, z2, more1;
 559
 560    mul64To128( a1, b, &z1, &z2 );
 561    mul64To128( a0, b, &z0, &more1 );
 562    add128( z0, more1, 0, z1, &z0, &z1 );
 563    *z2Ptr = z2;
 564    *z1Ptr = z1;
 565    *z0Ptr = z0;
 566
 567}
 568
 569/*----------------------------------------------------------------------------
 570| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
 571| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
 572| product.  The product is broken into four 64-bit pieces which are stored at
 573| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
 574*----------------------------------------------------------------------------*/
 575
 576static inline void
 577 mul128To256(
 578     uint64_t a0,
 579     uint64_t a1,
 580     uint64_t b0,
 581     uint64_t b1,
 582     uint64_t *z0Ptr,
 583     uint64_t *z1Ptr,
 584     uint64_t *z2Ptr,
 585     uint64_t *z3Ptr
 586 )
 587{
 588    uint64_t z0, z1, z2, z3;
 589    uint64_t more1, more2;
 590
 591    mul64To128( a1, b1, &z2, &z3 );
 592    mul64To128( a1, b0, &z1, &more2 );
 593    add128( z1, more2, 0, z2, &z1, &z2 );
 594    mul64To128( a0, b0, &z0, &more1 );
 595    add128( z0, more1, 0, z1, &z0, &z1 );
 596    mul64To128( a0, b1, &more1, &more2 );
 597    add128( more1, more2, 0, z2, &more1, &z2 );
 598    add128( z0, z1, 0, more1, &z0, &z1 );
 599    *z3Ptr = z3;
 600    *z2Ptr = z2;
 601    *z1Ptr = z1;
 602    *z0Ptr = z0;
 603
 604}
 605
 606/*----------------------------------------------------------------------------
 607| Returns an approximation to the 64-bit integer quotient obtained by dividing
 608| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
 609| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
 610| toward zero, the approximation returned lies between q and q + 2 inclusive.
 611| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
 612| unsigned integer is returned.
 613*----------------------------------------------------------------------------*/
 614
 615static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 616{
 617    uint64_t b0, b1;
 618    uint64_t rem0, rem1, term0, term1;
 619    uint64_t z;
 620
 621    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
 622    b0 = b>>32;
 623    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
 624    mul64To128( b, z, &term0, &term1 );
 625    sub128( a0, a1, term0, term1, &rem0, &rem1 );
 626    while ( ( (int64_t) rem0 ) < 0 ) {
 627        z -= UINT64_C(0x100000000);
 628        b1 = b<<32;
 629        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
 630    }
 631    rem0 = ( rem0<<32 ) | ( rem1>>32 );
 632    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
 633    return z;
 634
 635}
 636
 637/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 638 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 639 *
 640 * Licensed under the GPLv2/LGPLv3
 641 */
 642static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 643                                  uint64_t n0, uint64_t d)
 644{
 645#if defined(__x86_64__)
 646    uint64_t q;
 647    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 648    return q;
 649#elif defined(__s390x__) && !defined(__clang__)
 650    /* Need to use a TImode type to get an even register pair for DLGR.  */
 651    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 652    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 653    *r = n >> 64;
 654    return n;
 655#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 656    /* From Power ISA 2.06, programming note for divdeu.  */
 657    uint64_t q1, q2, Q, r1, r2, R;
 658    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 659        : "=&r"(q1), "=r"(q2)
 660        : "r"(n1), "r"(n0), "r"(d));
 661    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 662    r2 = n0 - (q2 * d);
 663    Q = q1 + q2;
 664    R = r1 + r2;
 665    if (R >= d || R < r2) { /* overflow implies R > d */
 666        Q += 1;
 667        R -= d;
 668    }
 669    *r = R;
 670    return Q;
 671#else
 672    uint64_t d0, d1, q0, q1, r1, r0, m;
 673
 674    d0 = (uint32_t)d;
 675    d1 = d >> 32;
 676
 677    r1 = n1 % d1;
 678    q1 = n1 / d1;
 679    m = q1 * d0;
 680    r1 = (r1 << 32) | (n0 >> 32);
 681    if (r1 < m) {
 682        q1 -= 1;
 683        r1 += d;
 684        if (r1 >= d) {
 685            if (r1 < m) {
 686                q1 -= 1;
 687                r1 += d;
 688            }
 689        }
 690    }
 691    r1 -= m;
 692
 693    r0 = r1 % d1;
 694    q0 = r1 / d1;
 695    m = q0 * d0;
 696    r0 = (r0 << 32) | (uint32_t)n0;
 697    if (r0 < m) {
 698        q0 -= 1;
 699        r0 += d;
 700        if (r0 >= d) {
 701            if (r0 < m) {
 702                q0 -= 1;
 703                r0 += d;
 704            }
 705        }
 706    }
 707    r0 -= m;
 708
 709    *r = r0;
 710    return (q1 << 32) | q0;
 711#endif
 712}
 713
 714/*----------------------------------------------------------------------------
 715| Returns an approximation to the square root of the 32-bit significand given
 716| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 717| `aExp' (the least significant bit) is 1, the integer returned approximates
 718| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
 719| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
 720| case, the approximation returned lies strictly within +/-2 of the exact
 721| value.
 722*----------------------------------------------------------------------------*/
 723
 724static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
 725{
 726    static const uint16_t sqrtOddAdjustments[] = {
 727        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
 728        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
 729    };
 730    static const uint16_t sqrtEvenAdjustments[] = {
 731        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
 732        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
 733    };
 734    int8_t index;
 735    uint32_t z;
 736
 737    index = ( a>>27 ) & 15;
 738    if ( aExp & 1 ) {
 739        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
 740        z = ( ( a / z )<<14 ) + ( z<<15 );
 741        a >>= 1;
 742    }
 743    else {
 744        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
 745        z = a / z + z;
 746        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
 747        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
 748    }
 749    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
 750
 751}
 752
 753/*----------------------------------------------------------------------------
 754| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
 755| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
 756| Otherwise, returns 0.
 757*----------------------------------------------------------------------------*/
 758
 759static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 760{
 761    return a0 == b0 && a1 == b1;
 762}
 763
 764/*----------------------------------------------------------------------------
 765| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 766| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
 767| Otherwise, returns 0.
 768*----------------------------------------------------------------------------*/
 769
 770static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 771{
 772    return a0 < b0 || (a0 == b0 && a1 <= b1);
 773}
 774
 775/*----------------------------------------------------------------------------
 776| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 777| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
 778| returns 0.
 779*----------------------------------------------------------------------------*/
 780
 781static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 782{
 783    return a0 < b0 || (a0 == b0 && a1 < b1);
 784}
 785
 786/*----------------------------------------------------------------------------
 787| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
 788| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
 789| Otherwise, returns 0.
 790*----------------------------------------------------------------------------*/
 791
 792static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 793{
 794    return a0 != b0 || a1 != b1;
 795}
 796
 797#endif
 798