linux/arch/m68k/math-emu/multi_arith.h
/* multi_arith.h: multi-precision integer arithmetic functions, needed
   to do extended-precision floating point.

   (c) 1998 David Huggins-Daines.

   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
   David Mosberger-Tang.

   You may copy, modify, and redistribute this file under the terms of
   the GNU General Public License, version 2, or any later version, at
   your convenience. */

/* Note:

   These are not general multi-precision math routines.  Rather, they
   implement the subset of integer arithmetic that we need in order to
   multiply, divide, and normalize 128-bit unsigned mantissae.  */

#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H

#if 0   /* old code... */

/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];

/* Word order */
enum {
        MSW128,
        NMSW128,
        NLSW128,
        LSW128
};

/* big-endian */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])

/* Convenience functions to stuff various integer values into int128s */

static inline void zero128(int128 a)
{
        a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}

/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
                          unsigned int i0, int128 a)
{
        a[LSW128] = i0;
        a[NLSW128] = i1;
        a[NMSW128] = i2;
        a[MSW128] = i3;
}

/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
        dest[LSW128] = (unsigned int) src;
        dest[NLSW128] = src >> 32;
        dest[NMSW128] = dest[MSW128] = 0;
}

static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
        *dest = src[LSW128] | (long long) src[NLSW128] << 32;
}

static inline void put_i128(const int128 a)
{
        printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
               a[NLSW128], a[LSW128]);
}

/* Internal shifters:

   Note that these are only good for 0 < count < 32.
 */
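/* (Shifting a 32-bit word by 32 or more bits is undefined in C, and the
   "32 - count" cross-word term in _lsl128()/_lsr128() below degenerates
   for count == 0, hence the restriction; lsl128()/lsr128() further down
   handle arbitrary counts by combining a sub-32-bit shift with whole-word
   moves.) */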

static inline void _lsl128(unsigned int count, int128 a)
{
        a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
        a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
        a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
        a[LSW128] <<= count;
}

static inline void _lsr128(unsigned int count, int128 a)
{
        a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
        a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
        a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
        a[MSW128] >>= count;
}

/* Should be faster, one would hope */

static inline void lslone128(int128 a)
{
        asm volatile ("lsl.l #1,%0\n"
                      "roxl.l #1,%1\n"
                      "roxl.l #1,%2\n"
                      "roxl.l #1,%3\n"
                      :
                      "=d" (a[LSW128]),
                      "=d"(a[NLSW128]),
                      "=d"(a[NMSW128]),
                      "=d"(a[MSW128])
                      :
                      "0"(a[LSW128]),
                      "1"(a[NLSW128]),
                      "2"(a[NMSW128]),
                      "3"(a[MSW128]));
}

static inline void lsrone128(int128 a)
{
        asm volatile ("lsr.l #1,%0\n"
                      "roxr.l #1,%1\n"
                      "roxr.l #1,%2\n"
                      "roxr.l #1,%3\n"
                      :
                      "=d" (a[MSW128]),
                      "=d"(a[NMSW128]),
                      "=d"(a[NLSW128]),
                      "=d"(a[LSW128])
                      :
                      "0"(a[MSW128]),
                      "1"(a[NMSW128]),
                      "2"(a[NLSW128]),
                      "3"(a[LSW128]));
}

/* Generalized 128-bit shifters:

   These bit-shift to a multiple of 32, then move whole longwords.  */
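/* For example, lsl128(40, a) first does _lsl128(8, a) (40 % 32) and then
   moves each longword of a[] one place toward the most-significant word
   (40 / 32 == 1), zero-filling from the least-significant end. */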

static inline void lsl128(unsigned int count, int128 a)
{
        int wordcount, i;

        if (count % 32)
                _lsl128(count % 32, a);

        if (0 == (wordcount = count / 32))
                return;

        /* argh, gak, endian-sensitive */
        for (i = 0; i < 4 - wordcount; i++) {
                a[i] = a[i + wordcount];
        }
        for (i = 3; i >= 4 - wordcount; --i) {
                a[i] = 0;
        }
}

static inline void lsr128(unsigned int count, int128 a)
{
        int wordcount, i;

        if (count % 32)
                _lsr128(count % 32, a);

        if (0 == (wordcount = count / 32))
                return;

        for (i = 3; i >= wordcount; --i) {
                a[i] = a[i - wordcount];
        }
        for (i = 0; i < wordcount; i++) {
                a[i] = 0;
        }
}

static inline void orl128(int a, int128 b)
{
        b[LSW128] |= a;
}

static inline int btsthi128(const int128 a)
{
        return a[MSW128] & 0x80000000;
}

/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
        int r = 0;

        if (top > 31)
                r |= a[LSW128];
        if (top > 63)
                r |= a[NLSW128];
        if (top > 95)
                r |= a[NMSW128];

        /* mask of the low (top % 32 + 1) bits, without ever shifting by 32 */
        r |= a[3 - (top / 32)] & (0xffffffff >> (31 - top % 32));

        return (r != 0);
}

/* Aargh.  We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
        *mask = 0;

        if (pos < 32) {
                LO_WORD(*mask) = (1 << pos) - 1;
                return;
        }
        LO_WORD(*mask) = -1;
        HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}

static inline void bset64(int pos, unsigned long long *dest)
{
        /* This conditional will be optimized away.  Thanks, GCC! */
        if (pos < 32)
                asm volatile ("bset %1,%0":"=m"
                              (LO_WORD(*dest)):"id"(pos));
        else
                asm volatile ("bset %1,%0":"=m"
                              (HI_WORD(*dest)):"id"(pos - 32));
}

static inline int btst64(int pos, unsigned long long dest)
{
        if (pos < 32)
                return (0 != (LO_WORD(dest) & (1 << pos)));
        else
                return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}

static inline void lsl64(int count, unsigned long long *dest)
{
        if (count < 32) {
                HI_WORD(*dest) = (HI_WORD(*dest) << count)
                    | (LO_WORD(*dest) >> (32 - count));
                LO_WORD(*dest) <<= count;
                return;
        }
        count -= 32;
        HI_WORD(*dest) = LO_WORD(*dest) << count;
        LO_WORD(*dest) = 0;
}

static inline void lsr64(int count, unsigned long long *dest)
{
        if (count < 32) {
                LO_WORD(*dest) = (LO_WORD(*dest) >> count)
                    | (HI_WORD(*dest) << (32 - count));
                HI_WORD(*dest) >>= count;
                return;
        }
        count -= 32;
        LO_WORD(*dest) = HI_WORD(*dest) >> count;
        HI_WORD(*dest) = 0;
}
#endif

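/* fp_denormalize(): shift the 64-bit mantissa of reg right by cnt bits to
   denormalize it, raising the exponent by cnt to compensate.  reg->lowmant
   receives the top eight bits shifted out; any lower bits that would be
   lost are OR'd into its least-significant bit as a sticky bit for
   rounding. */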
static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
        reg->exp += cnt;

        switch (cnt) {
        case 0 ... 8:
                reg->lowmant = reg->mant.m32[1] << (8 - cnt);
                reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
                                   (reg->mant.m32[0] << (32 - cnt));
                reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
                break;
        case 9 ... 32:
                reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
                if (reg->mant.m32[1] << (40 - cnt))
                        reg->lowmant |= 1;
                reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
                                   (reg->mant.m32[0] << (32 - cnt));
                reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
                break;
        case 33 ... 39:
                asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
                        : "m" (reg->mant.m32[0]), "d" (64 - cnt));
                if (reg->mant.m32[1] << (40 - cnt))
                        reg->lowmant |= 1;
                reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
                reg->mant.m32[0] = 0;
                break;
        case 40 ... 71:
                reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
                if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
                        reg->lowmant |= 1;
                reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
                reg->mant.m32[0] = 0;
                break;
        default:
                reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
                reg->mant.m32[0] = 0;
                reg->mant.m32[1] = 0;
                break;
        }
}

static inline int fp_overnormalize(struct fp_ext *reg)
{
        int shift;

        if (reg->mant.m32[0]) {
                asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));
                reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift));
                reg->mant.m32[1] = (reg->mant.m32[1] << shift);
        } else {
                asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
                reg->mant.m32[0] = (reg->mant.m32[1] << shift);
                reg->mant.m32[1] = 0;
                shift += 32;
        }

        return shift;
}

static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
{
        int carry;

        /* we assume here that gcc only inserts a move and a clr instruction */
        asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
                : "g,d" (src->lowmant), "0,0" (dest->lowmant));
        asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
                : "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
        asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
                : "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
        asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));

        return carry;
}

static inline int fp_addcarry(struct fp_ext *reg)
{
        if (++reg->exp == 0x7fff) {
                if (reg->mant.m64)
                        fp_set_sr(FPSR_EXC_INEX2);
                reg->mant.m64 = 0;
                fp_set_sr(FPSR_EXC_OVFL);
                return 0;
        }
        reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
        reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
                           (reg->mant.m32[0] << 31);
        reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;

        return 1;
}

static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
                              struct fp_ext *src2)
{
        /* we assume here that gcc only inserts a move and a clr instruction */
        asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
                : "g,d" (src2->lowmant), "0,0" (src1->lowmant));
        asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
                : "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
        asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
                : "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
}

#define fp_mul64(desth, destl, src1, src2) ({                           \
        asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)             \
                : "dm" (src1), "0" (src2));                             \
})
#define fp_div64(quot, rem, srch, srcl, div)                            \
        asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)                \
                : "dm" (div), "1" (srch), "0" (srcl))
#define fp_add64(dest1, dest2, src1, src2) ({                           \
        asm ("add.l %1,%0" : "=d,dm" (dest2)                            \
                : "dm,d" (src2), "0,0" (dest2));                        \
        asm ("addx.l %1,%0" : "=d" (dest1)                              \
                : "d" (src1), "0" (dest1));                             \
})
#define fp_addx96(dest, src) ({                                         \
        /* we assume here that gcc only inserts a move and a clr instruction */ \
        asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2])             \
                : "g,d" (src.m32[1]), "0,0" (dest->m32[2]));            \
        asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1])              \
                : "d" (src.m32[0]), "0" (dest->m32[1]));                \
        asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0])              \
                : "d" (0), "0" (dest->m32[0]));                         \
})
#define fp_sub64(dest, src) ({                                          \
        asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])                      \
                : "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));            \
        asm ("subx.l %1,%0" : "=d" (dest.m32[0])                        \
                : "d" (src.m32[0]), "0" (dest.m32[0]));                 \
})
#define fp_sub96c(dest, srch, srcm, srcl) ({                            \
        char carry;                                                     \
        asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])                      \
                : "dm,d" (srcl), "0,0" (dest.m32[2]));                  \
        asm ("subx.l %1,%0" : "=d" (dest.m32[1])                        \
                : "d" (srcm), "0" (dest.m32[1]));                       \
        asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])  \
                : "d" (srch), "1" (dest.m32[0]));                       \
        carry;                                                          \
})

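/* fp_multiplymant(): 64x64 -> 128-bit multiply of two mantissae, done as
   four 32x32 -> 64-bit partial products.  Writing src1 as A*2^32 + B and
   src2 as C*2^32 + D:

        (A*2^32 + B) * (C*2^32 + D) = A*C*2^64 + (A*D + B*C)*2^32 + B*D

   A*C and B*D go straight into the upper and lower halves of dest; each
   cross term is then added in at bit 32, with carries propagated through
   the upper words by fp_addx96(). */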
static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
                                   struct fp_ext *src2)
{
        union fp_mant64 temp;

        fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
        fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);

        fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
        fp_addx96(dest, temp);

        fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
        fp_addx96(dest, temp);
}

static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
                                 struct fp_ext *div)
{
        union fp_mant128 tmp;
        union fp_mant64 tmp64;
        unsigned long *mantp = dest->m32;
        unsigned long fix, rem, first, dummy;
        int i;

        /* the algorithm below requires the dividend (src) to be smaller
           than div; both have the high bit set */
        if (src->mant.m64 >= div->mant.m64) {
                fp_sub64(src->mant, div->mant);
                *mantp = 1;
        } else
                *mantp = 0;
        mantp++;

        /* Basic idea behind this algorithm: we can't divide two 64-bit
           numbers (AB/CD) directly, but we can compute AB/C0.  That
           quotient is too large by a factor of CD/C0, so we multiply the
           first result by an approximation of C0/CD to fix it up; after
           that the result is nearly correct and only a few small
           corrections are needed. */

        /* C0/CD could be precalculated, but that is a 64-bit division
           again.  We make it easier by first dividing through by C, which
           gives 10/1D; after a single shift the value fits into 32 bits. */
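        /* For example, in decimal: 8765/43 can be estimated as
           8765/40 = 219, then scaled by 40/43 to get about 203, which
           is already the true quotient (8765 = 43*203 + 36). */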
        fix = 0x80000000;
        dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
        dummy = (dummy >> 1) | fix;
        fp_div64(fix, dummy, fix, 0, dummy);
        fix--;

        for (i = 0; i < 3; i++, mantp++) {
                if (src->mant.m32[0] == div->mant.m32[0]) {
                        fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);

                        fp_mul64(*mantp, dummy, first, fix);
                        *mantp += fix;
                } else {
                        fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);

                        fp_mul64(*mantp, dummy, first, fix);
                }

                fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
                fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
                tmp.m32[2] = 0;

                fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
                fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);

                src->mant.m32[0] = tmp.m32[1];
                src->mant.m32[1] = tmp.m32[2];

                while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
                        src->mant.m32[0] = tmp.m32[1];
                        src->mant.m32[1] = tmp.m32[2];
                        *mantp += 1;
                }
        }
}

#if 0
static inline unsigned int fp_fls128(union fp_mant128 *src)
{
        unsigned long data;
        unsigned int res, off;

        if ((data = src->m32[0]))
                off = 0;
        else if ((data = src->m32[1]))
                off = 32;
        else if ((data = src->m32[2]))
                off = 64;
        else if ((data = src->m32[3]))
                off = 96;
        else
                return 128;

        asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
        return res + off;
}

static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
        unsigned long sticky;

        switch (shift) {
        case 0:
                return;
        case 1:
                asm volatile ("lsl.l #1,%0"
                        : "=d" (src->m32[3]) : "0" (src->m32[3]));
                asm volatile ("roxl.l #1,%0"
                        : "=d" (src->m32[2]) : "0" (src->m32[2]));
                asm volatile ("roxl.l #1,%0"
                        : "=d" (src->m32[1]) : "0" (src->m32[1]));
                asm volatile ("roxl.l #1,%0"
                        : "=d" (src->m32[0]) : "0" (src->m32[0]));
                return;
        case 2 ... 31:
                src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
                src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
                src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
                src->m32[3] = (src->m32[3] << shift);
                return;
        case 32 ... 63:
                shift -= 32;
                src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
                src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
                src->m32[2] = (src->m32[3] << shift);
                src->m32[3] = 0;
                return;
        case 64 ... 95:
                shift -= 64;
                src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
                src->m32[1] = (src->m32[3] << shift);
                src->m32[2] = src->m32[3] = 0;
                return;
        case 96 ... 127:
                shift -= 96;
                src->m32[0] = (src->m32[3] << shift);
                src->m32[1] = src->m32[2] = src->m32[3] = 0;
                return;
        case -31 ... -1:
                shift = -shift;
                sticky = 0;
                if (src->m32[3] << (32 - shift))
                        sticky = 1;
                src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
                src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
                src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
                src->m32[0] = (src->m32[0] >> shift);
                return;
        case -63 ... -32:
                shift = -shift - 32;
                sticky = 0;
                if ((src->m32[2] << (32 - shift)) || src->m32[3])
                        sticky = 1;
                src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
                src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
                src->m32[1] = (src->m32[0] >> shift);
                src->m32[0] = 0;
                return;
        case -95 ... -64:
                shift = -shift - 64;
                sticky = 0;
                if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
                        sticky = 1;
                src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
                src->m32[2] = (src->m32[0] >> shift);
                src->m32[1] = src->m32[0] = 0;
                return;
        case -127 ... -96:
                shift = -shift - 96;
                sticky = 0;
                if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
                        sticky = 1;
                src->m32[3] = (src->m32[0] >> shift) | sticky;
                src->m32[2] = src->m32[1] = src->m32[0] = 0;
                return;
        }

        if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
                src->m32[3] = 1;
        else
                src->m32[3] = 0;
        src->m32[2] = 0;
        src->m32[1] = 0;
        src->m32[0] = 0;
}
#endif

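/* fp_putmant128(): put the 128-bit intermediate mantissa in src back into
   dest, shifted left by "shift" bits (0, 1, 31 or 32 are the cases that
   occur): the top 64 bits go into dest->mant, the next eight into
   dest->lowmant, and any remaining low bits are OR'd into lowmant's
   least-significant bit as a sticky bit. */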
static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
                                 int shift)
{
        unsigned long tmp;

        switch (shift) {
        case 0:
                dest->mant.m64 = src->m64[0];
                dest->lowmant = src->m32[2] >> 24;
                if (src->m32[3] || (src->m32[2] << 8))
                        dest->lowmant |= 1;
                break;
        case 1:
                asm volatile ("lsl.l #1,%0"
                        : "=d" (tmp) : "0" (src->m32[2]));
                asm volatile ("roxl.l #1,%0"
                        : "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
                asm volatile ("roxl.l #1,%0"
                        : "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
                dest->lowmant = tmp >> 24;
                if (src->m32[3] || (tmp << 8))
                        dest->lowmant |= 1;
                break;
        case 31:
                asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
                        : "=d" (dest->mant.m32[0])
                        : "d" (src->m32[0]), "0" (src->m32[1]));
                asm volatile ("roxr.l #1,%0"
                        : "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
                asm volatile ("roxr.l #1,%0"
                        : "=d" (tmp) : "0" (src->m32[3]));
                dest->lowmant = tmp >> 24;
                if (src->m32[3] << 7)
                        dest->lowmant |= 1;
                break;
        case 32:
                dest->mant.m32[0] = src->m32[1];
                dest->mant.m32[1] = src->m32[2];
                dest->lowmant = src->m32[3] >> 24;
                if (src->m32[3] << 8)
                        dest->lowmant |= 1;
                break;
        }
}

#if 0 /* old code... */
static inline int fls(unsigned int a)
{
        int r;

        asm volatile ("bfffo %1{#0,#32},%0"
                      : "=d" (r) : "md" (a));
        return r;
}

/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
        if (a[MSW128])
                return fls(a[MSW128]);
        if (a[NMSW128])
                return fls(a[NMSW128]) + 32;
        /* XXX: it probably never gets beyond this point in actual
           use, but that's indicative of a more general problem in the
           algorithm (i.e. as per the actual 68881 implementation, we
           really only need at most 67 bits of precision [plus
           overflow]) so I'm not going to fix it. */
        if (a[NLSW128])
                return fls(a[NLSW128]) + 64;
        if (a[LSW128])
                return fls(a[LSW128]) + 96;
        else
                return -1;
}

static inline int zerop128(const int128 a)
{
        return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

static inline int nonzerop128(const int128 a)
{
        return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}

/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
   here */
static inline void add128(const int128 a, int128 b)
{
        /* rotating carry flags */
        unsigned int carry[2];

        carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
        b[LSW128] += a[LSW128];

        carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
        b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];

        carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
        b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];

        b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}

/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
        /* rotating borrow flags */
        unsigned int borrow[2];

        borrow[0] = b[LSW128] < a[LSW128];
        b[LSW128] -= a[LSW128];

        borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
        b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];

        borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
        b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];

        b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}

/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
        unsigned long long acc;
        int128 acc128;

        zero128(acc128);
        zero128(c);

        /* first the low words */
        if (LO_WORD(a) && LO_WORD(b)) {
                acc = (long long) LO_WORD(a) * LO_WORD(b);
                c[NLSW128] = HI_WORD(acc);
                c[LSW128] = LO_WORD(acc);
        }
        /* Next the high words */
        if (HI_WORD(a) && HI_WORD(b)) {
                acc = (long long) HI_WORD(a) * HI_WORD(b);
                c[MSW128] = HI_WORD(acc);
                c[NMSW128] = LO_WORD(acc);
        }
        /* The middle words */
        if (LO_WORD(a) && HI_WORD(b)) {
                acc = (long long) LO_WORD(a) * HI_WORD(b);
                acc128[NMSW128] = HI_WORD(acc);
                acc128[NLSW128] = LO_WORD(acc);
                add128(acc128, c);
        }
        /* The first and last words */
        if (HI_WORD(a) && LO_WORD(b)) {
                acc = (long long) HI_WORD(a) * LO_WORD(b);
                acc128[NMSW128] = HI_WORD(acc);
                acc128[NLSW128] = LO_WORD(acc);
                add128(acc128, c);
        }
}

/* Note: unsigned */
static inline int cmp128(int128 a, int128 b)
{
        if (a[MSW128] < b[MSW128])
                return -1;
        if (a[MSW128] > b[MSW128])
                return 1;
        if (a[NMSW128] < b[NMSW128])
                return -1;
        if (a[NMSW128] > b[NMSW128])
                return 1;
        if (a[NLSW128] < b[NLSW128])
                return -1;
        if (a[NLSW128] > b[NLSW128])
                return 1;

        if (a[LSW128] < b[LSW128])
                return -1;
        if (a[LSW128] > b[LSW128])
                return 1;
        return 0;
}

static inline void div128(int128 a, int128 b, int128 c)
{
        int128 mask;

        /* Algorithm:

           Shift the divisor until it's at least as big as the
           dividend, keeping track of the position to which we've
           shifted it, i.e. the power of 2 which we've multiplied it
           by.

           Then, for this power of 2 (the mask), and every one smaller
           than it, subtract the raised divisor from the dividend and
           add the mask to the quotient until the dividend is smaller
           than the raised divisor.  At that point, divide the raised
           divisor and the mask by 2 (i.e. shift one place to the
           right).  Lather, rinse, and repeat, until there are no more
           powers of 2 left. */
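        /* Worked example with small numbers, 100/7: the divisor is
           shifted up to 112 (mask = 16), then walked back down:

               112 > 100: skip                          -> mask = 8, divisor = 56
                56 <= 100: 100 - 56 = 44, quotient += 8  (=  8)
                28 <=  44:  44 - 28 = 16, quotient += 4  (= 12)
                14 <=  16:  16 - 14 =  2, quotient += 2  (= 14)
                 7 >    2: skip                          -> mask = 0, done

           quotient = 14, remainder (left in a) = 2. */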

        /* FIXME: needless to say, there's room for improvement here too. */

        /* Shift up */
        /* XXX: since it just has to be "at least as big", we can
           probably eliminate this horribly wasteful loop.  I will
           have to prove this first, though */
        set128(0, 0, 0, 1, mask);
        while (cmp128(b, a) < 0 && !btsthi128(b)) {
                lslone128(b);
                lslone128(mask);
        }

        /* Shift down */
        zero128(c);
        do {
                if (cmp128(a, b) >= 0) {
                        sub128(b, a);
                        add128(mask, c);
                }
                lsrone128(mask);
                lsrone128(b);
        } while (nonzerop128(mask));

        /* The remainder is in a... */
}
#endif

#endif  /* MULTI_ARITH_H */