linux/arch/ia64/lib/memset.S
/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
        Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
        in0:    dest
        in1:    value
        in2:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then store remaining
   words, finally store remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */

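/* For reference, a rough C sketch of the flow described above.  This is an
   illustration only (predication, bundling and the store-ahead prefetch are
   lost in translation); memset_sketch is a hypothetical name:

        #include <stddef.h>
        #include <stdint.h>

        void *memset_sketch(void *dest, int value, size_t cnt)
        {
                uint8_t *p = dest;
                uint64_t v8 = (uint8_t)value * 0x0101010101010101ULL; // cf. mux1 @brcst

                while (((uintptr_t)p & 15) && cnt) {    // head: reach 16B alignment
                        *p++ = (uint8_t)value;
                        cnt--;
                }
                while (cnt >= 128) {                    // 128B cache-line chunks
                        for (int i = 0; i < 16; i++, p += 8)
                                *(uint64_t *)p = v8;
                        cnt -= 128;
                }
                while (cnt >= 32) {                     // 32B chunks
                        for (int i = 0; i < 4; i++, p += 8)
                                *(uint64_t *)p = v8;
                        cnt -= 32;
                }
                while (cnt >= 8) {                      // remaining words
                        *(uint64_t *)p = v8;
                        p += 8;
                        cnt -= 8;
                }
                while (cnt--)                           // remaining bytes
                        *p++ = (uint8_t)value;
                return dest;
        }
*/
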
#include <asm/asmmacro.h>
#undef ret

#define dest            in0
#define value           in1
#define cnt             in2

#define tmp             r31
#define save_lc         r30
#define ptr0            r29
#define ptr1            r28
#define ptr2            r27
#define ptr3            r26
#define ptr9            r24
#define loopcnt         r23
#define linecnt         r22
#define bytecnt         r21

#define fvalue          f6

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr           p6                      // default register for same-cycle branches
#define p_nz            p7
#define p_zr            p8
#define p_unalgn        p9
#define p_y             p11
#define p_n             p12
#define p_yy            p13
#define p_nn            p14

#define MIN1            15                      // mask: dest & MIN1 = offset within a 16B line
#define MIN1P1HALF      8                       // (MIN1+1)/2: midpoint of a 16B line
#define LINE_SIZE       128
#define LSIZE_SH        7                       // shift amount to convert bytes to lines
#define PREF_AHEAD      8
GLOBAL_ENTRY(memset)
{ .mmi
        .prologue
        alloc   tmp = ar.pfs, 3, 0, 0, 0
        lfetch.nt1 [dest]                       // prefetch the first line
        .save   ar.lc, save_lc
        mov.i   save_lc = ar.lc
        .body
} { .mmi
        mov     ret0 = dest                     // return value
        cmp.ne  p_nz, p_zr = value, r0          // use stf.spill if value is zero
        cmp.eq  p_scr, p0 = cnt, r0
;; }
{ .mmi
        and     ptr2 = -(MIN1+1), dest          // aligned address
        and     tmp = MIN1, dest                // prepare to check for correct alignment
        tbit.nz p_y, p_n = dest, 0              // Do we have an odd address? (M_B_U)
} { .mib
        mov     ptr1 = dest
        mux1    value = value, @brcst           // create 8 identical bytes in word
(p_scr) br.ret.dpnt.many rp                     // return immediately if count = 0
;; }
{ .mib
        cmp.ne  p_unalgn, p0 = tmp, r0          //
} { .mib
        sub     bytecnt = (MIN1+1), tmp         // NB: # of bytes to move is 1 higher than loopcnt
        cmp.gt  p_scr, p0 = 16, cnt             // is it a minimalistic task?
(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
;; }
{ .mmi
(p_unalgn) add  ptr1 = (MIN1+1), ptr2           // after alignment
(p_unalgn) add  ptr2 = MIN1P1HALF, ptr2         // after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3    // should we do a st8 ?
;; }
{ .mib
(p_y)   add     cnt = -8, cnt                   // 8 bytes of the head will be stored
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2  // should we do a st4 ?
} { .mib
(p_y)   st8     [ptr2] = value,-4               // store 8, step back to the st4 slot
(p_n)   add     ptr2 = 4, ptr2                  // no st8: advance to the st4 slot
;; }
{ .mib
(p_yy)  add     cnt = -4, cnt                   // 4 bytes of the head will be stored
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1    // should we do a st2 ?
} { .mib
(p_yy)  st4     [ptr2] = value,-2               // store 4, step back to the st2 slot
(p_nn)  add     ptr2 = 2, ptr2                  // no st4: advance to the st2 slot
;; }
{ .mmi
        mov     tmp = LINE_SIZE+1               // for compare
(p_y)   add     cnt = -2, cnt                   // 2 bytes of the head will be stored
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0  // should we do a st1 ?
} { .mmi
        setf.sig fvalue=value                   // transfer value to FLP side
(p_y)   st2     [ptr2] = value,-1               // store 2, step back to the st1 slot
(p_n)   add     ptr2 = 1, ptr2                  // no st2: advance to the st1 slot
;; }

{ .mmi
(p_yy)  st1     [ptr2] = value                  // last head byte
        cmp.gt  p_scr, p0 = tmp, cnt            // is it a minimalistic task?
} { .mbb
(p_yy)  add     cnt = -1, cnt                   // 1 byte of the head was stored
(p_scr) br.cond.dpnt.many .fraction_of_line     // go move just a few
;; }
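
/* The predicated stores above fill the sub-16B head with at most one st8,
   st4, st2 and st1: bytecnt = 16 - (dest & 15) is decomposed into its bits,
   and ptr2 walks down after each taken store (or up after a skipped one) so
   every store is naturally aligned and lands exactly on the still unfilled
   gap.  A C sketch of the same walk (head_fill is a hypothetical name; v8
   is the replicated value):

        #include <stdint.h>

        static void head_fill(uint8_t *dest, uint64_t v8, unsigned bytecnt)
        {
                uint8_t *p = (uint8_t *)((uintptr_t)dest & ~15UL) + 8;

                if (bytecnt & 8) { *(uint64_t *)p = v8;           p -= 4; }
                else                                              p += 4;
                if (bytecnt & 4) { *(uint32_t *)p = (uint32_t)v8; p -= 2; }
                else                                              p += 2;
                if (bytecnt & 2) { *(uint16_t *)p = (uint16_t)v8; p -= 1; }
                else                                              p += 1;
                if (bytecnt & 1)   *p = (uint8_t)v8;
        }

   For bytecnt = 15 this stores at offsets 8, 4, 2 and 1 of the 16B line,
   tiling the 15-byte gap without overlap.  */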

{ .mib
        nop.m 0
        shr.u   linecnt = cnt, LSIZE_SH
(p_zr)  br.cond.dptk.many .l1b                  // Jump to use stf.spill
;; }

        TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
{ .mmi
        and     tmp = -(LINE_SIZE), cnt         // compute end of range
        mov     ptr9 = ptr1                     // used for prefetching
        and     cnt = (LINE_SIZE-1), cnt        // remainder
} { .mmi
        mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
        cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
;; }
{ .mmi
(p_scr) add     loopcnt = -1, linecnt           //
        add     ptr2 = 8, ptr1                  // start of stores (beyond prefetch stores)
        add     ptr1 = tmp, ptr1                // first address beyond total range
;; }
{ .mmi
        add     tmp = -1, linecnt               // next loop count
        mov.i   ar.lc = loopcnt                 //
;; }
.pref_l1a:
{ .mib
        stf8 [ptr9] = fvalue, 128               // Do stores one cache line apart
        nop.i   0
        br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
        add     ptr0 = 16, ptr2                 // Two stores in parallel
        mov.i   ar.lc = tmp                     //
;; }
.l1ax:
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 24
        stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 24
        stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 24
        stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 32
        cmp.lt  p_scr, p0 = ptr9, ptr1          // do we need more prefetching?
 ;; }
{ .mmb
        stf8 [ptr2] = fvalue, 24
(p_scr) stf8 [ptr9] = fvalue, 128
        br.cloop.dptk.few .l1ax
;; }
{ .mbb
        cmp.le  p_scr, p0 = 8, cnt              // just a few bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line    // Branch no. 2
        br.cond.dpnt.many  .move_bytes_from_alignment   // Branch no. 3
;; }

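/* The L1A loop above touches each 128B line early: .pref_l1a does one stf8
   per line, up to PREF_AHEAD lines ahead, so the store itself acts as a
   prefetch that allocates the cache line, and .l1ax then fills the remaining
   15 words of each line through two interleaved pointers while ptr9 keeps
   storing one word into a line further ahead.  A C sketch of the pattern
   (fill_lines is a hypothetical name):

        #include <stddef.h>
        #include <stdint.h>

        static void fill_lines(uint8_t *p, uint64_t v8, size_t linecnt)
        {
                uint8_t *ahead = p;
                uint8_t *end = p + linecnt * 128;
                size_t k = linecnt < 8 ? linecnt : 8;   // PREF_AHEAD

                for (size_t i = 0; i < k; i++, ahead += 128)
                        *(uint64_t *)ahead = v8;        // store-ahead = prefetch

                for (size_t l = 0; l < linecnt; l++, p += 128) {
                        for (int i = 1; i < 16; i++)    // word 0 was stored ahead
                                *(uint64_t *)(p + 8 * i) = v8;
                        if (ahead < end) {              // keep running ahead
                                *(uint64_t *)ahead = v8;
                                ahead += 128;
                        }
                }
        }
*/
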
        TEXT_ALIGN(32)
.l1b:   // ------------------------------------ //  L1B: store ahead into cache lines; fill later
{ .mmi
        and     tmp = -(LINE_SIZE), cnt         // compute end of range
        mov     ptr9 = ptr1                     // used for prefetching
        and     cnt = (LINE_SIZE-1), cnt        // remainder
} { .mmi
        mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
        cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
;; }
{ .mmi
(p_scr) add     loopcnt = -1, linecnt
        add     ptr2 = 16, ptr1                 // start of stores (beyond prefetch stores)
        add     ptr1 = tmp, ptr1                // first address beyond total range
;; }
{ .mmi
        add     tmp = -1, linecnt               // next loop count
        mov.i   ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
        stf.spill [ptr9] = f0, 128              // Do stores one cache line apart
        nop.i   0
        br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
        add     ptr0 = 16, ptr2                 // Two stores in parallel
        mov.i   ar.lc = tmp
;; }
.l1bx:
 { .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 64
        cmp.lt  p_scr, p0 = ptr9, ptr1          // do we need more prefetching?
 ;; }
{ .mmb
        stf.spill [ptr2] = f0, 32
(p_scr) stf.spill [ptr9] = f0, 128
        br.cloop.dptk.few .l1bx
;; }
{ .mib
        cmp.gt  p_scr, p0 = 8, cnt              // just a few bytes left ?
(p_scr) br.cond.dpnt.many  .move_bytes_from_alignment   //
;; }

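/* L1B is the value = 0 variant of the same loop: stf.spill stores all 16
   bytes of a floating-point register in one instruction, and f0 is
   architecturally zero, so each line needs only 8 stores instead of 16.
   A C analogue, assuming a 16-byte store primitive (GCC's unsigned __int128
   is used here purely as illustration):

        static void zero_line(void *line)
        {
                unsigned __int128 *p = line;    // 16B-aligned after the head fill

                for (int i = 0; i < 8; i++)     // eight 16B stores per 128B line
                        p[i] = 0;
        }
*/
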
.fraction_of_line:
{ .mib
        add     ptr2 = 16, ptr1
        shr.u   loopcnt = cnt, 5                // loopcnt = cnt / 32
;; }
{ .mib
        cmp.eq  p_scr, p0 = loopcnt, r0
        add     loopcnt = -1, loopcnt
(p_scr) br.cond.dpnt.many .store_words
;; }
{ .mib
        and     cnt = 0x1f, cnt                 // compute the remaining cnt
        mov.i   ar.lc = loopcnt
;; }
        TEXT_ALIGN(32)
.l2:    // ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
        stf8    [ptr1] = fvalue, 8
        stf8    [ptr2] = fvalue, 8
;; } { .mmb
        stf8    [ptr1] = fvalue, 24
        stf8    [ptr2] = fvalue, 24
        br.cloop.dptk.many .l2
;; }
.store_words:
{ .mib
        cmp.gt  p_scr, p0 = 8, cnt              // just a few bytes left ?
(p_scr) br.cond.dpnt.many .move_bytes_from_alignment    // Branch
;; }

{ .mmi
        stf8    [ptr1] = fvalue, 8              // store
        cmp.le  p_y, p_n = 16, cnt
        add     cnt = -8, cnt                   // subtract
;; }
{ .mmi
(p_y)   stf8    [ptr1] = fvalue, 8              // store
(p_y)   cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)   add     cnt = -8, cnt                   // subtract
;; }
{ .mmi                                          // store
(p_yy)  stf8    [ptr1] = fvalue, 8
(p_yy)  add     cnt = -8, cnt                   // subtract
;; }

.move_bytes_from_alignment:
{ .mib
        cmp.eq  p_scr, p0 = cnt, r0
        tbit.nz.unc p_y, p0 = cnt, 2            // should we terminate with a st4 ?
(p_scr) br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)   st4     [ptr1] = value,4
        tbit.nz.unc p_yy, p0 = cnt, 1           // should we terminate with a st2 ?
;; }
{ .mib
(p_yy)  st2     [ptr1] = value,2
        tbit.nz.unc p_y, p0 = cnt, 0            // should we terminate with a st1 ?
;; }

{ .mib
(p_y)   st1     [ptr1] = value
;; }
.restore_and_exit:
{ .mib
        nop.m   0
        mov.i   ar.lc = save_lc
        br.ret.sptk.many rp
;; }

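/* The tail above avoids loops where it can: .store_words peels at most three
   predicated 8B stores, and .move_bytes_from_alignment finishes by testing
   bits 2, 1 and 0 of cnt, one naturally aligned store per set bit (ptr1 is
   8B-aligned on this path).  A C sketch of the final step (tail_fill is a
   hypothetical name):

        #include <stdint.h>

        static void tail_fill(uint8_t *p, uint64_t v8, unsigned cnt)
        {
                // cnt < 8 here
                if (cnt & 4) { *(uint32_t *)p = (uint32_t)v8; p += 4; }
                if (cnt & 2) { *(uint16_t *)p = (uint16_t)v8; p += 2; }
                if (cnt & 1)   *p = (uint8_t)v8;
        }
*/
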
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)   cmp.le  p_yy, p_nn = 4, cnt
(p_y)   cmp.le  p_yy, p_nn = 5, cnt
(p_n)   add     ptr2 = 2, ptr1
} { .mmi
(p_y)   add     ptr2 = 3, ptr1
(p_y)   st1     [ptr1] = value, 1               // fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)   add     cnt = -1, cnt
;; }
{ .mmi
(p_yy)  cmp.le.unc p_y, p0 = 8, cnt
        add     ptr3 = ptr1, cnt                // prepare last store
        mov.i   ar.lc = save_lc
} { .mmi
(p_yy)  st2     [ptr1] = value, 4               // fill 2 (aligned) bytes
(p_yy)  st2     [ptr2] = value, 4               // fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)  add     cnt = -4, cnt
;; }
{ .mmi
(p_y)   cmp.le.unc p_yy, p0 = 8, cnt
        add     ptr3 = -1, ptr3                 // last store
        tbit.nz p_scr, p0 = cnt, 1              // will there be a st2 at the end ?
} { .mmi
(p_y)   st2     [ptr1] = value, 4               // fill 2 (aligned) bytes
(p_y)   st2     [ptr2] = value, 4               // fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)   add     cnt = -4, cnt
;; }
{ .mmi
(p_yy)  st2     [ptr1] = value, 4               // fill 2 (aligned) bytes
(p_yy)  st2     [ptr2] = value, 4               // fill 2 (aligned) bytes [3, 2 (or less) left]
        tbit.nz p_y, p0 = cnt, 0                // will there be a st1 at the end ?
} { .mmi
(p_yy)  add     cnt = -4, cnt
;; }
{ .mmb
(p_scr) st2     [ptr1] = value                  // fill 2 (aligned) bytes
(p_y)   st1     [ptr3] = value                  // fill last byte (using ptr3)
        br.ret.sptk.many rp
}
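
/* A C sketch of this short unaligned path (short_fill is a hypothetical
   name): one st1 brings an odd dest to a 2B boundary, pairs of st2 stores
   move 4 bytes per round through two pointers, and the last byte, if any,
   goes through ptr3 = dest + cnt - 1:

        #include <stddef.h>
        #include <stdint.h>

        static void short_fill(uint8_t *p, int value, size_t cnt)
        {
                uint16_t v2 = (uint8_t)value * 0x0101;
                uint8_t *last = p + cnt - 1;            // ptr3; 1 <= cnt < 16 here

                if ((uintptr_t)p & 1) {                 // odd address
                        *p++ = (uint8_t)value;
                        cnt--;
                }
                while (cnt >= 4) {                      // at most three rounds
                        *(uint16_t *)p = v2;            // st2 via ptr1
                        *(uint16_t *)(p + 2) = v2;      // st2 via ptr2
                        p += 4;
                        cnt -= 4;
                }
                if (cnt & 2)
                        *(uint16_t *)p = v2;
                if (cnt & 1)
                        *last = (uint8_t)value;
        }
*/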
END(memset)