LXR linux/arch/m68k/fpsp040/setox.S

   1|
   2|       setox.sa 3.1 12/10/90
   3|
   4|       The entry point setox computes the exponential of a value.
   5|       setoxd does the same except the input value is a denormalized
   6|       number. setoxm1 computes exp(X)-1, and setoxm1d computes
   7|       exp(X)-1 for denormalized X.
   8|
   9|       INPUT
  10|       -----
  11|       Double-extended value in memory location pointed to by address
  12|       register a0.
  13|
  14|       OUTPUT
  15|       ------
  16|       exp(X) or exp(X)-1 returned in floating-point register fp0.
  17|
  18|       ACCURACY and MONOTONICITY
  19|       -------------------------
  20|       The returned result is within 0.85 ulps in 64 significant bit, i.e.
  21|       within 0.5001 ulp to 53 bits if the result is subsequently rounded
  22|       to double precision. The result is provably monotonic in double
  23|       precision.
  24|
  25|       SPEED
  26|       -----
  27|       Two timings are measured, both in the copy-back mode. The
  28|       first one is measured when the function is invoked the first time
  29|       (so the instructions and data are not in cache), and the
  30|       second one is measured when the function is reinvoked at the same
  31|       input argument.
  32|
  33|       The program setox takes approximately 210/190 cycles for input
  34|       argument X whose magnitude is less than 16380 log2, which
  35|       is the usual situation. For the less common arguments,
  36|       depending on their values, the program may run faster or slower --
  37|       but no worse than 10% slower even in the extreme cases.
  38|
  39|       The program setoxm1 takes approximately ??? / ??? cycles for input
  40|       argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
  41|       approximately ??? / ??? cycles. For the less common arguments,
  42|       depending on their values, the program may run faster or slower --
  43|       but no worse than 10% slower even in the extreme cases.
  44|
  45|       ALGORITHM and IMPLEMENTATION NOTES
  46|       ----------------------------------
  47|
  48|       setoxd
  49|       ------
  50|       Step 1. Set ans := 1.0
  51|
  52|       Step 2. Return  ans := ans + sign(X)*2^(-126). Exit.
  53|       Notes:  This will always generate one exception -- inexact.
  54|
  55|
  56|       setox
  57|       -----
  58|
  59|       Step 1. Filter out extreme cases of input argument.
  60|               1.1     If |X| >= 2^(-65), go to Step 1.3.
  61|               1.2     Go to Step 7.
  62|               1.3     If |X| < 16380 log(2), go to Step 2.
  63|               1.4     Go to Step 8.
  64|       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
  65|                To avoid the use of floating-point comparisons, a
  66|                compact representation of |X| is used. This format is a
  67|                32-bit integer, the upper (more significant) 16 bits are
  68|                the sign and biased exponent field of |X|; the lower 16
  69|                bits are the 16 most significant fraction (including the
  70|                explicit bit) bits of |X|. Consequently, the comparisons
  71|                in Steps 1.1 and 1.3 can be performed by integer comparison.
  72|                Note also that the constant 16380 log(2) used in Step 1.3
  73|                is also in the compact form. Thus taking the branch
  74|                to Step 2 guarantees |X| < 16380 log(2). There is no harm
  75|                to have a small number of cases where |X| is less than,
  76|                but close to, 16380 log(2) and the branch to Step 9 is
  77|                taken.
  78|
  79|       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
  80|               2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
  81|               2.2     N := round-to-nearest-integer( X * 64/log2 ).
  82|               2.3     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
  83|               2.4     Calculate       M = (N - J)/64; so N = 64M + J.
  84|               2.5     Calculate the address of the stored value of 2^(J/64).
  85|               2.6     Create the value Scale = 2^M.
  86|       Notes:  The calculation in 2.2 is really performed by
  87|
  88|                       Z := X * constant
  89|                       N := round-to-nearest-integer(Z)
  90|
  91|                where
  92|
  93|                       constant := single-precision( 64/log 2 ).
  94|
  95|                Using a single-precision constant avoids memory access.
  96|                Another effect of using a single-precision "constant" is
  97|                that the calculated value Z is
  98|
  99|                       Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
 100|
 101|                This error has to be considered later in Steps 3 and 4.
 102|
 103|       Step 3. Calculate X - N*log2/64.
 104|               3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
 105|               3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
 106|       Notes:  a) The way L1 and L2 are chosen ensures L1+L2 approximate
 107|                the value      -log2/64        to 88 bits of accuracy.
 108|                b) N*L1 is exact because N is no longer than 22 bits and
 109|                L1 is no longer than 24 bits.
 110|                c) The calculation X+N*L1 is also exact due to cancellation.
 111|                Thus, R is practically X+N(L1+L2) to full 64 bits.
 112|                d) It is important to estimate how large can |R| be after
 113|                Step 3.2.
 114|
 115|                       N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
 116|                       X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5
 117|                       X*64/log2 - N   =       f - eps*X 64/log2
 118|                       X - N*log2/64   =       f*log2/64 - eps*X
 119|
 120|
 121|                Now |X| <= 16446 log2, thus
 122|
 123|                       |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
 124|                                       <= 0.57 log2/64.
 125|                This bound will be used in Step 4.
 126|
 127|       Step 4. Approximate exp(R)-1 by a polynomial
 128|                       p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
 129|       Notes:  a) In order to reduce memory access, the coefficients are
 130|                made as "short" as possible: A1 (which is 1/2), A4 and A5
 131|                are single precision; A2 and A3 are double precision.
 132|                b) Even with the restrictions above,
 133|                       |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
 134|                Note that 0.0062 is slightly bigger than 0.57 log2/64.
 135|                c) To fully utilize the pipeline, p is separated into
 136|                two independent pieces of roughly equal complexities
 137|                       p = [ R + R*S*(A2 + S*A4) ]     +
 138|                               [ S*(A1 + S*(A3 + S*A5)) ]
 139|                where S = R*R.
 140|
 141|       Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
 142|                               ans := T + ( T*p + t)
 143|                where T and t are the stored values for 2^(J/64).
 144|       Notes:  2^(J/64) is stored as T and t where T+t approximates
 145|                2^(J/64) to roughly 85 bits; T is in extended precision
 146|                and t is in single precision. Note also that T is rounded
 147|                to 62 bits so that the last two bits of T are zero. The
 148|                reason for such a special form is that T-1, T-2, and T-8
 149|                will all be exact --- a property that will give much
 150|                more accurate computation of the function EXPM1.
 151|
 152|       Step 6. Reconstruction of exp(X)
 153|                       exp(X) = 2^M * 2^(J/64) * exp(R).
 154|               6.1     If AdjFlag = 0, go to 6.3
 155|               6.2     ans := ans * AdjScale
 156|               6.3     Restore the user FPCR
 157|               6.4     Return ans := ans * Scale. Exit.
 158|       Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
 159|                |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
 160|                neither overflow nor underflow. If AdjFlag = 1, that
 161|                means that
 162|                       X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
 163|                Hence, exp(X) may overflow or underflow or neither.
 164|                When that is the case, AdjScale = 2^(M1) where M1 is
 165|                approximately M. Thus 6.2 will never cause over/underflow.
 166|                Possible exception in 6.4 is overflow or underflow.
 167|                The inexact exception is not generated in 6.4. Although
 168|                one can argue that the inexact flag should always be
 169|                raised, to simulate that exception cost to much than the
 170|                flag is worth in practical uses.
 171|
 172|       Step 7. Return 1 + X.
 173|               7.1     ans := X
 174|               7.2     Restore user FPCR.
 175|               7.3     Return ans := 1 + ans. Exit
 176|       Notes:  For non-zero X, the inexact exception will always be
 177|                raised by 7.3. That is the only exception raised by 7.3.
 178|                Note also that we use the FMOVEM instruction to move X
 179|                in Step 7.1 to avoid unnecessary trapping. (Although
 180|                the FMOVEM may not seem relevant since X is normalized,
 181|                the precaution will be useful in the library version of
 182|                this code where the separate entry for denormalized inputs
 183|                will be done away with.)
 184|
 185|       Step 8. Handle exp(X) where |X| >= 16380log2.
 186|               8.1     If |X| > 16480 log2, go to Step 9.
 187|               (mimic 2.2 - 2.6)
 188|               8.2     N := round-to-integer( X * 64/log2 )
 189|               8.3     Calculate J = N mod 64, J = 0,1,...,63
 190|               8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
 191|               8.5     Calculate the address of the stored value 2^(J/64).
 192|               8.6     Create the values Scale = 2^M, AdjScale = 2^M1.
 193|               8.7     Go to Step 3.
 194|       Notes:  Refer to notes for 2.2 - 2.6.
 195|
 196|       Step 9. Handle exp(X), |X| > 16480 log2.
 197|               9.1     If X < 0, go to 9.3
 198|               9.2     ans := Huge, go to 9.4
 199|               9.3     ans := Tiny.
 200|               9.4     Restore user FPCR.
 201|               9.5     Return ans := ans * ans. Exit.
 202|       Notes:  Exp(X) will surely overflow or underflow, depending on
 203|                X's sign. "Huge" and "Tiny" are respectively large/tiny
 204|                extended-precision numbers whose square over/underflow
 205|                with an inexact result. Thus, 9.5 always raises the
 206|                inexact together with either overflow or underflow.
 207|
 208|
 209|       setoxm1d
 210|       --------
 211|
 212|       Step 1. Set ans := 0
 213|
 214|       Step 2. Return  ans := X + ans. Exit.
 215|       Notes:  This will return X with the appropriate rounding
 216|                precision prescribed by the user FPCR.
 217|
 218|       setoxm1
 219|       -------
 220|
 221|       Step 1. Check |X|
 222|               1.1     If |X| >= 1/4, go to Step 1.3.
 223|               1.2     Go to Step 7.
 224|               1.3     If |X| < 70 log(2), go to Step 2.
 225|               1.4     Go to Step 10.
 226|       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
 227|                However, it is conceivable |X| can be small very often
 228|                because EXPM1 is intended to evaluate exp(X)-1 accurately
 229|                when |X| is small. For further details on the comparisons,
 230|                see the notes on Step 1 of setox.
 231|
 232|       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
 233|               2.1     N := round-to-nearest-integer( X * 64/log2 ).
 234|               2.2     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
 235|               2.3     Calculate       M = (N - J)/64; so N = 64M + J.
 236|               2.4     Calculate the address of the stored value of 2^(J/64).
 237|               2.5     Create the values Sc = 2^M and OnebySc := -2^(-M).
 238|       Notes:  See the notes on Step 2 of setox.
 239|
 240|       Step 3. Calculate X - N*log2/64.
 241|               3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
 242|               3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
 243|       Notes:  Applying the analysis of Step 3 of setox in this case
 244|                shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
 245|                this case).
 246|
 247|       Step 4. Approximate exp(R)-1 by a polynomial
 248|                       p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
 249|       Notes:  a) In order to reduce memory access, the coefficients are
 250|                made as "short" as possible: A1 (which is 1/2), A5 and A6
 251|                are single precision; A2, A3 and A4 are double precision.
 252|                b) Even with the restriction above,
 253|                       |p - (exp(R)-1)| <      |R| * 2^(-72.7)
 254|                for all |R| <= 0.0055.
 255|                c) To fully utilize the pipeline, p is separated into
 256|                two independent pieces of roughly equal complexity
 257|                       p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +
 258|                               [ R + S*(A1 + S*(A3 + S*A5)) ]
 259|                where S = R*R.
 260|
 261|       Step 5. Compute 2^(J/64)*p by
 262|                               p := T*p
 263|                where T and t are the stored values for 2^(J/64).
 264|       Notes:  2^(J/64) is stored as T and t where T+t approximates
 265|                2^(J/64) to roughly 85 bits; T is in extended precision
 266|                and t is in single precision. Note also that T is rounded
 267|                to 62 bits so that the last two bits of T are zero. The
 268|                reason for such a special form is that T-1, T-2, and T-8
 269|                will all be exact --- a property that will be exploited
 270|                in Step 6 below. The total relative error in p is no
 271|                bigger than 2^(-67.7) compared to the final result.
 272|
 273|       Step 6. Reconstruction of exp(X)-1
 274|                       exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
 275|               6.1     If M <= 63, go to Step 6.3.
 276|               6.2     ans := T + (p + (t + OnebySc)). Go to 6.6
 277|               6.3     If M >= -3, go to 6.5.
 278|               6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6
 279|               6.5     ans := (T + OnebySc) + (p + t).
 280|               6.6     Restore user FPCR.
 281|               6.7     Return ans := Sc * ans. Exit.
 282|       Notes:  The various arrangements of the expressions give accurate
 283|                evaluations.
 284|
 285|       Step 7. exp(X)-1 for |X| < 1/4.
 286|               7.1     If |X| >= 2^(-65), go to Step 9.
 287|               7.2     Go to Step 8.
 288|
 289|       Step 8. Calculate exp(X)-1, |X| < 2^(-65).
 290|               8.1     If |X| < 2^(-16312), goto 8.3
 291|               8.2     Restore FPCR; return ans := X - 2^(-16382). Exit.
 292|               8.3     X := X * 2^(140).
 293|               8.4     Restore FPCR; ans := ans - 2^(-16382).
 294|                Return ans := ans*2^(140). Exit
 295|       Notes:  The idea is to return "X - tiny" under the user
 296|                precision and rounding modes. To avoid unnecessary
 297|                inefficiency, we stay away from denormalized numbers the
 298|                best we can. For |X| >= 2^(-16312), the straightforward
 299|                8.2 generates the inexact exception as the case warrants.
 300|
 301|       Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
 302|                       p = X + X*X*(B1 + X*(B2 + ... + X*B12))
 303|       Notes:  a) In order to reduce memory access, the coefficients are
 304|                made as "short" as possible: B1 (which is 1/2), B9 to B12
 305|                are single precision; B3 to B8 are double precision; and
 306|                B2 is double extended.
 307|                b) Even with the restriction above,
 308|                       |p - (exp(X)-1)| < |X| 2^(-70.6)
 309|                for all |X| <= 0.251.
 310|                Note that 0.251 is slightly bigger than 1/4.
 311|                c) To fully preserve accuracy, the polynomial is computed
 312|                as     X + ( S*B1 +    Q ) where S = X*X and
 313|                       Q       =       X*S*(B2 + X*(B3 + ... + X*B12))
 314|                d) To fully utilize the pipeline, Q is separated into
 315|                two independent pieces of roughly equal complexity
 316|                       Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
 317|                               [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
 318|
 319|       Step 10.        Calculate exp(X)-1 for |X| >= 70 log 2.
 320|               10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
 321|                purposes. Therefore, go to Step 1 of setox.
 322|               10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
 323|                ans := -1
 324|                Restore user FPCR
 325|                Return ans := ans + 2^(-126). Exit.
 326|       Notes:  10.2 will always create an inexact and return -1 + tiny
 327|                in the user rounding precision and mode.
 328|
 329|
 330
 331|               Copyright (C) Motorola, Inc. 1990
 332|                       All Rights Reserved
 333|
 334|       For details on the license for this file, please see the
 335|       file, README, in this same directory.
 336
 337|setox  idnt    2,1 | Motorola 040 Floating Point Software Package
 338
 339        |section        8
 340
 341#include "fpsp.h"
 342
 343L2:     .long   0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
 344
 345EXPA3:  .long   0x3FA55555,0x55554431
 346EXPA2:  .long   0x3FC55555,0x55554018
 347
 348HUGE:   .long   0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
 349TINY:   .long   0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
 350
 351EM1A4:  .long   0x3F811111,0x11174385
 352EM1A3:  .long   0x3FA55555,0x55554F5A
 353
 354EM1A2:  .long   0x3FC55555,0x55555555,0x00000000,0x00000000
 355
 356EM1B8:  .long   0x3EC71DE3,0xA5774682
 357EM1B7:  .long   0x3EFA01A0,0x19D7CB68
 358
 359EM1B6:  .long   0x3F2A01A0,0x1A019DF3
 360EM1B5:  .long   0x3F56C16C,0x16C170E2
 361
 362EM1B4:  .long   0x3F811111,0x11111111
 363EM1B3:  .long   0x3FA55555,0x55555555
 364
 365EM1B2:  .long   0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
 366        .long   0x00000000
 367
 368TWO140: .long   0x48B00000,0x00000000
 369TWON140:        .long   0x37300000,0x00000000
 370
 371EXPTBL:
 372        .long   0x3FFF0000,0x80000000,0x00000000,0x00000000
 373        .long   0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
 374        .long   0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
 375        .long   0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
 376        .long   0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
 377        .long   0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
 378        .long   0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
 379        .long   0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
 380        .long   0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
 381        .long   0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
 382        .long   0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
 383        .long   0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
 384        .long   0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
 385        .long   0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
 386        .long   0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
 387        .long   0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
 388        .long   0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
 389        .long   0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
 390        .long   0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
 391        .long   0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
 392        .long   0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
 393        .long   0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
 394        .long   0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
 395        .long   0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
 396        .long   0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
 397        .long   0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
 398        .long   0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
 399        .long   0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
 400        .long   0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
 401        .long   0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
 402        .long   0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
 403        .long   0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
 404        .long   0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
 405        .long   0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
 406        .long   0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
 407        .long   0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
 408        .long   0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
 409        .long   0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
 410        .long   0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
 411        .long   0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
 412        .long   0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
 413        .long   0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
 414        .long   0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
 415        .long   0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
 416        .long   0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
 417        .long   0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
 418        .long   0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
 419        .long   0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
 420        .long   0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
 421        .long   0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
 422        .long   0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
 423        .long   0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
 424        .long   0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
 425        .long   0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
 426        .long   0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
 427        .long   0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
 428        .long   0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
 429        .long   0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
 430        .long   0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
 431        .long   0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
 432        .long   0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
 433        .long   0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
 434        .long   0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
 435        .long   0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
 436
 437        .set    ADJFLAG,L_SCR2
 438        .set    SCALE,FP_SCR1
 439        .set    ADJSCALE,FP_SCR2
 440        .set    SC,FP_SCR3
 441        .set    ONEBYSC,FP_SCR4
 442
 443        | xref  t_frcinx
 444        |xref   t_extdnrm
 445        |xref   t_unfl
 446        |xref   t_ovfl
 447
 448        .global setoxd
 449setoxd:
 450|--entry point for EXP(X), X is denormalized
 451        movel           (%a0),%d0
 452        andil           #0x80000000,%d0
 453        oril            #0x00800000,%d0         | ...sign(X)*2^(-126)
 454        movel           %d0,-(%sp)
 455        fmoves          #0x3F800000,%fp0
 456        fmovel          %d1,%fpcr
 457        fadds           (%sp)+,%fp0
 458        bra             t_frcinx
 459
 460        .global setox
 461setox:
 462|--entry point for EXP(X), here X is finite, non-zero, and not NaN's
 463
 464|--Step 1.
 465        movel           (%a0),%d0        | ...load part of input X
 466        andil           #0x7FFF0000,%d0 | ...biased expo. of X
 467        cmpil           #0x3FBE0000,%d0 | ...2^(-65)
 468        bges            EXPC1           | ...normal case
 469        bra             EXPSM
 470
 471EXPC1:
 472|--The case |X| >= 2^(-65)
 473        movew           4(%a0),%d0      | ...expo. and partial sig. of |X|
 474        cmpil           #0x400CB167,%d0 | ...16380 log2 trunc. 16 bits
 475        blts            EXPMAIN  | ...normal case
 476        bra             EXPBIG
 477
 478EXPMAIN:
 479|--Step 2.
 480|--This is the normal branch:   2^(-65) <= |X| < 16380 log2.
 481        fmovex          (%a0),%fp0      | ...load input from (a0)
 482
 483        fmovex          %fp0,%fp1
 484        fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
 485        fmovemx %fp2-%fp2/%fp3,-(%a7)           | ...save fp2
 486        movel           #0,ADJFLAG(%a6)
 487        fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
 488        lea             EXPTBL,%a1
 489        fmovel          %d0,%fp0                | ...convert to floating-format
 490
 491        movel           %d0,L_SCR1(%a6) | ...save N temporarily
 492        andil           #0x3F,%d0               | ...D0 is J = N mod 64
 493        lsll            #4,%d0
 494        addal           %d0,%a1         | ...address of 2^(J/64)
 495        movel           L_SCR1(%a6),%d0
 496        asrl            #6,%d0          | ...D0 is M
 497        addiw           #0x3FFF,%d0     | ...biased expo. of 2^(M)
 498        movew           L2,L_SCR1(%a6)  | ...prefetch L2, no need in CB
 499
 500EXPCONT1:
 501|--Step 3.
 502|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
 503|--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
 504        fmovex          %fp0,%fp2
 505        fmuls           #0xBC317218,%fp0        | ...N * L1, L1 = lead(-log2/64)
 506        fmulx           L2,%fp2         | ...N * L2, L1+L2 = -log2/64
 507        faddx           %fp1,%fp0               | ...X + N*L1
 508        faddx           %fp2,%fp0               | ...fp0 is R, reduced arg.
 509|       MOVE.W          #$3FA5,EXPA3    ...load EXPA3 in cache
 510
 511|--Step 4.
 512|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
 513|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
 514|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
 515|--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
 516
 517        fmovex          %fp0,%fp1
 518        fmulx           %fp1,%fp1               | ...fp1 IS S = R*R
 519
 520        fmoves          #0x3AB60B70,%fp2        | ...fp2 IS A5
 521|       MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
 522
 523        fmulx           %fp1,%fp2               | ...fp2 IS S*A5
 524        fmovex          %fp1,%fp3
 525        fmuls           #0x3C088895,%fp3        | ...fp3 IS S*A4
 526
 527        faddd           EXPA3,%fp2      | ...fp2 IS A3+S*A5
 528        faddd           EXPA2,%fp3      | ...fp3 IS A2+S*A4
 529
 530        fmulx           %fp1,%fp2               | ...fp2 IS S*(A3+S*A5)
 531        movew           %d0,SCALE(%a6)  | ...SCALE is 2^(M) in extended
 532        clrw            SCALE+2(%a6)
 533        movel           #0x80000000,SCALE+4(%a6)
 534        clrl            SCALE+8(%a6)
 535
 536        fmulx           %fp1,%fp3               | ...fp3 IS S*(A2+S*A4)
 537
 538        fadds           #0x3F000000,%fp2        | ...fp2 IS A1+S*(A3+S*A5)
 539        fmulx           %fp0,%fp3               | ...fp3 IS R*S*(A2+S*A4)
 540
 541        fmulx           %fp1,%fp2               | ...fp2 IS S*(A1+S*(A3+S*A5))
 542        faddx           %fp3,%fp0               | ...fp0 IS R+R*S*(A2+S*A4),
 543|                                       ...fp3 released
 544
 545        fmovex          (%a1)+,%fp1     | ...fp1 is lead. pt. of 2^(J/64)
 546        faddx           %fp2,%fp0               | ...fp0 is EXP(R) - 1
 547|                                       ...fp2 released
 548
 549|--Step 5
 550|--final reconstruction process
 551|--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
 552
 553        fmulx           %fp1,%fp0               | ...2^(J/64)*(Exp(R)-1)
 554        fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
 555        fadds           (%a1),%fp0      | ...accurate 2^(J/64)
 556
 557        faddx           %fp1,%fp0               | ...2^(J/64) + 2^(J/64)*...
 558        movel           ADJFLAG(%a6),%d0
 559
 560|--Step 6
 561        tstl            %d0
 562        beqs            NORMAL
 563ADJUST:
 564        fmulx           ADJSCALE(%a6),%fp0
 565NORMAL:
 566        fmovel          %d1,%FPCR               | ...restore user FPCR
 567        fmulx           SCALE(%a6),%fp0 | ...multiply 2^(M)
 568        bra             t_frcinx
 569
 570EXPSM:
 571|--Step 7
 572        fmovemx (%a0),%fp0-%fp0 | ...in case X is denormalized
 573        fmovel          %d1,%FPCR
 574        fadds           #0x3F800000,%fp0        | ...1+X in user mode
 575        bra             t_frcinx
 576
 577EXPBIG:
 578|--Step 8
 579        cmpil           #0x400CB27C,%d0 | ...16480 log2
 580        bgts            EXP2BIG
 581|--Steps 8.2 -- 8.6
 582        fmovex          (%a0),%fp0      | ...load input from (a0)
 583
 584        fmovex          %fp0,%fp1
 585        fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
 586        fmovemx  %fp2-%fp2/%fp3,-(%a7)          | ...save fp2
 587        movel           #1,ADJFLAG(%a6)
 588        fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
 589        lea             EXPTBL,%a1
 590        fmovel          %d0,%fp0                | ...convert to floating-format
 591        movel           %d0,L_SCR1(%a6)                 | ...save N temporarily
 592        andil           #0x3F,%d0                | ...D0 is J = N mod 64
 593        lsll            #4,%d0
 594        addal           %d0,%a1                 | ...address of 2^(J/64)
 595        movel           L_SCR1(%a6),%d0
 596        asrl            #6,%d0                  | ...D0 is K
 597        movel           %d0,L_SCR1(%a6)                 | ...save K temporarily
 598        asrl            #1,%d0                  | ...D0 is M1
 599        subl            %d0,L_SCR1(%a6)                 | ...a1 is M
 600        addiw           #0x3FFF,%d0             | ...biased expo. of 2^(M1)
 601        movew           %d0,ADJSCALE(%a6)               | ...ADJSCALE := 2^(M1)
 602        clrw            ADJSCALE+2(%a6)
 603        movel           #0x80000000,ADJSCALE+4(%a6)
 604        clrl            ADJSCALE+8(%a6)
 605        movel           L_SCR1(%a6),%d0                 | ...D0 is M
 606        addiw           #0x3FFF,%d0             | ...biased expo. of 2^(M)
 607        bra             EXPCONT1                | ...go back to Step 3
 608
 609EXP2BIG:
 610|--Step 9
 611        fmovel          %d1,%FPCR
 612        movel           (%a0),%d0
 613        bclrb           #sign_bit,(%a0)         | ...setox always returns positive
 614        cmpil           #0,%d0
 615        blt             t_unfl
 616        bra             t_ovfl
 617
 618        .global setoxm1d
 619setoxm1d:
 620|--entry point for EXPM1(X), here X is denormalized
 621|--Step 0.
 622        bra             t_extdnrm
 623
 624
 625        .global setoxm1
 626setoxm1:
 627|--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
 628
 629|--Step 1.
 630|--Step 1.1
 631        movel           (%a0),%d0        | ...load part of input X
 632        andil           #0x7FFF0000,%d0 | ...biased expo. of X
 633        cmpil           #0x3FFD0000,%d0 | ...1/4
 634        bges            EM1CON1  | ...|X| >= 1/4
 635        bra             EM1SM
 636
 637EM1CON1:
 638|--Step 1.3
 639|--The case |X| >= 1/4
 640        movew           4(%a0),%d0      | ...expo. and partial sig. of |X|
 641        cmpil           #0x4004C215,%d0 | ...70log2 rounded up to 16 bits
 642        bles            EM1MAIN  | ...1/4 <= |X| <= 70log2
 643        bra             EM1BIG
 644
 645EM1MAIN:
 646|--Step 2.
 647|--This is the case:    1/4 <= |X| <= 70 log2.
 648        fmovex          (%a0),%fp0      | ...load input from (a0)
 649
 650        fmovex          %fp0,%fp1
 651        fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
 652        fmovemx %fp2-%fp2/%fp3,-(%a7)           | ...save fp2
 653|       MOVE.W          #$3F81,EM1A4            ...prefetch in CB mode
 654        fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
 655        lea             EXPTBL,%a1
 656        fmovel          %d0,%fp0                | ...convert to floating-format
 657
 658        movel           %d0,L_SCR1(%a6)                 | ...save N temporarily
 659        andil           #0x3F,%d0                | ...D0 is J = N mod 64
 660        lsll            #4,%d0
 661        addal           %d0,%a1                 | ...address of 2^(J/64)
 662        movel           L_SCR1(%a6),%d0
 663        asrl            #6,%d0                  | ...D0 is M
 664        movel           %d0,L_SCR1(%a6)                 | ...save a copy of M
 665|       MOVE.W          #$3FDC,L2               ...prefetch L2 in CB mode
 666
 667|--Step 3.
 668|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
 669|--a0 points to 2^(J/64), D0 and a1 both contain M
 670        fmovex          %fp0,%fp2
 671        fmuls           #0xBC317218,%fp0        | ...N * L1, L1 = lead(-log2/64)
 672        fmulx           L2,%fp2         | ...N * L2, L1+L2 = -log2/64
 673        faddx           %fp1,%fp0        | ...X + N*L1
 674        faddx           %fp2,%fp0        | ...fp0 is R, reduced arg.
 675|       MOVE.W          #$3FC5,EM1A2            ...load EM1A2 in cache
 676        addiw           #0x3FFF,%d0             | ...D0 is biased expo. of 2^M
 677
 678|--Step 4.
 679|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
 680|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
 681|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
 682|--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
 683
 684        fmovex          %fp0,%fp1
 685        fmulx           %fp1,%fp1               | ...fp1 IS S = R*R
 686
 687        fmoves          #0x3950097B,%fp2        | ...fp2 IS a6
 688|       MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
 689
 690        fmulx           %fp1,%fp2               | ...fp2 IS S*A6
 691        fmovex          %fp1,%fp3
 692        fmuls           #0x3AB60B6A,%fp3        | ...fp3 IS S*A5
 693
 694        faddd           EM1A4,%fp2      | ...fp2 IS A4+S*A6
 695        faddd           EM1A3,%fp3      | ...fp3 IS A3+S*A5
 696        movew           %d0,SC(%a6)             | ...SC is 2^(M) in extended
 697        clrw            SC+2(%a6)
 698        movel           #0x80000000,SC+4(%a6)
 699        clrl            SC+8(%a6)
 700
 701        fmulx           %fp1,%fp2               | ...fp2 IS S*(A4+S*A6)
 702        movel           L_SCR1(%a6),%d0         | ...D0 is      M
 703        negw            %d0             | ...D0 is -M
 704        fmulx           %fp1,%fp3               | ...fp3 IS S*(A3+S*A5)
 705        addiw           #0x3FFF,%d0     | ...biased expo. of 2^(-M)
 706        faddd           EM1A2,%fp2      | ...fp2 IS A2+S*(A4+S*A6)
 707        fadds           #0x3F000000,%fp3        | ...fp3 IS A1+S*(A3+S*A5)
 708
 709        fmulx           %fp1,%fp2               | ...fp2 IS S*(A2+S*(A4+S*A6))
 710        oriw            #0x8000,%d0     | ...signed/expo. of -2^(-M)
 711        movew           %d0,ONEBYSC(%a6)        | ...OnebySc is -2^(-M)
 712        clrw            ONEBYSC+2(%a6)
 713        movel           #0x80000000,ONEBYSC+4(%a6)
 714        clrl            ONEBYSC+8(%a6)
 715        fmulx           %fp3,%fp1               | ...fp1 IS S*(A1+S*(A3+S*A5))
 716|                                       ...fp3 released
 717
 718        fmulx           %fp0,%fp2               | ...fp2 IS R*S*(A2+S*(A4+S*A6))
 719        faddx           %fp1,%fp0               | ...fp0 IS R+S*(A1+S*(A3+S*A5))
 720|                                       ...fp1 released
 721
 722        faddx           %fp2,%fp0               | ...fp0 IS EXP(R)-1
 723|                                       ...fp2 released
 724        fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
 725
 726|--Step 5
 727|--Compute 2^(J/64)*p
 728
 729        fmulx           (%a1),%fp0      | ...2^(J/64)*(Exp(R)-1)
 730
 731|--Step 6
 732|--Step 6.1
 733        movel           L_SCR1(%a6),%d0         | ...retrieve M
 734        cmpil           #63,%d0
 735        bles            MLE63
 736|--Step 6.2     M >= 64
 737        fmoves          12(%a1),%fp1    | ...fp1 is t
 738        faddx           ONEBYSC(%a6),%fp1       | ...fp1 is t+OnebySc
 739        faddx           %fp1,%fp0               | ...p+(t+OnebySc), fp1 released
 740        faddx           (%a1),%fp0      | ...T+(p+(t+OnebySc))
 741        bras            EM1SCALE
 742MLE63:
 743|--Step 6.3     M <= 63
 744        cmpil           #-3,%d0
 745        bges            MGEN3
 746MLTN3:
 747|--Step 6.4     M <= -4
 748        fadds           12(%a1),%fp0    | ...p+t
 749        faddx           (%a1),%fp0      | ...T+(p+t)
 750        faddx           ONEBYSC(%a6),%fp0       | ...OnebySc + (T+(p+t))
 751        bras            EM1SCALE
 752MGEN3:
 753|--Step 6.5     -3 <= M <= 63
 754        fmovex          (%a1)+,%fp1     | ...fp1 is T
 755        fadds           (%a1),%fp0      | ...fp0 is p+t
 756        faddx           ONEBYSC(%a6),%fp1       | ...fp1 is T+OnebySc
 757        faddx           %fp1,%fp0               | ...(T+OnebySc)+(p+t)
 758
 759EM1SCALE:
 760|--Step 6.6
 761        fmovel          %d1,%FPCR
 762        fmulx           SC(%a6),%fp0
 763
 764        bra             t_frcinx
 765
 766EM1SM:
 767|--Step 7       |X| < 1/4.
 768        cmpil           #0x3FBE0000,%d0 | ...2^(-65)
 769        bges            EM1POLY
 770
 771EM1TINY:
 772|--Step 8       |X| < 2^(-65)
 773        cmpil           #0x00330000,%d0 | ...2^(-16312)
 774        blts            EM12TINY
 775|--Step 8.2
 776        movel           #0x80010000,SC(%a6)     | ...SC is -2^(-16382)
 777        movel           #0x80000000,SC+4(%a6)
 778        clrl            SC+8(%a6)
 779        fmovex          (%a0),%fp0
 780        fmovel          %d1,%FPCR
 781        faddx           SC(%a6),%fp0
 782
 783        bra             t_frcinx
 784
 785EM12TINY:
 786|--Step 8.3
 787        fmovex          (%a0),%fp0
 788        fmuld           TWO140,%fp0
 789        movel           #0x80010000,SC(%a6)
 790        movel           #0x80000000,SC+4(%a6)
 791        clrl            SC+8(%a6)
 792        faddx           SC(%a6),%fp0
 793        fmovel          %d1,%FPCR
 794        fmuld           TWON140,%fp0
 795
 796        bra             t_frcinx
 797
 798EM1POLY:
 799|--Step 9       exp(X)-1 by a simple polynomial
 800        fmovex          (%a0),%fp0      | ...fp0 is X
 801        fmulx           %fp0,%fp0               | ...fp0 is S := X*X
 802        fmovemx %fp2-%fp2/%fp3,-(%a7)   | ...save fp2
 803        fmoves          #0x2F30CAA8,%fp1        | ...fp1 is B12
 804        fmulx           %fp0,%fp1               | ...fp1 is S*B12
 805        fmoves          #0x310F8290,%fp2        | ...fp2 is B11
 806        fadds           #0x32D73220,%fp1        | ...fp1 is B10+S*B12
 807
 808        fmulx           %fp0,%fp2               | ...fp2 is S*B11
 809        fmulx           %fp0,%fp1               | ...fp1 is S*(B10 + ...
 810
 811        fadds           #0x3493F281,%fp2        | ...fp2 is B9+S*...
 812        faddd           EM1B8,%fp1      | ...fp1 is B8+S*...
 813
 814        fmulx           %fp0,%fp2               | ...fp2 is S*(B9+...
 815        fmulx           %fp0,%fp1               | ...fp1 is S*(B8+...
 816
 817        faddd           EM1B7,%fp2      | ...fp2 is B7+S*...
 818        faddd           EM1B6,%fp1      | ...fp1 is B6+S*...
 819
 820        fmulx           %fp0,%fp2               | ...fp2 is S*(B7+...
 821        fmulx           %fp0,%fp1               | ...fp1 is S*(B6+...
 822
 823        faddd           EM1B5,%fp2      | ...fp2 is B5+S*...
 824        faddd           EM1B4,%fp1      | ...fp1 is B4+S*...
 825
 826        fmulx           %fp0,%fp2               | ...fp2 is S*(B5+...
 827        fmulx           %fp0,%fp1               | ...fp1 is S*(B4+...
 828
 829        faddd           EM1B3,%fp2      | ...fp2 is B3+S*...
 830        faddx           EM1B2,%fp1      | ...fp1 is B2+S*...
 831
 832        fmulx           %fp0,%fp2               | ...fp2 is S*(B3+...
 833        fmulx           %fp0,%fp1               | ...fp1 is S*(B2+...
 834
 835        fmulx           %fp0,%fp2               | ...fp2 is S*S*(B3+...)
 836        fmulx           (%a0),%fp1      | ...fp1 is X*S*(B2...
 837
 838        fmuls           #0x3F000000,%fp0        | ...fp0 is S*B1
 839        faddx           %fp2,%fp1               | ...fp1 is Q
 840|                                       ...fp2 released
 841
 842        fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
 843
 844        faddx           %fp1,%fp0               | ...fp0 is S*B1+Q
 845|                                       ...fp1 released
 846
 847        fmovel          %d1,%FPCR
 848        faddx           (%a0),%fp0
 849
 850        bra             t_frcinx
 851
 852EM1BIG:
 853|--Step 10      |X| > 70 log2
 854        movel           (%a0),%d0
 855        cmpil           #0,%d0
 856        bgt             EXPC1
 857|--Step 10.2
 858        fmoves          #0xBF800000,%fp0        | ...fp0 is -1
 859        fmovel          %d1,%FPCR
 860        fadds           #0x00800000,%fp0        | ...-1 + 2^(-126)
 861
 862        bra             t_frcinx
 863
 864        |end
 865