linux/arch/ia64/lib/memset.S
/* SPDX-License-Identifier: GPL-2.0 */
/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
        Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
        in0:    dest
        in1:    value
        in2:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
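
/* For orientation only, a rough C-level sketch of the flow described
   above (hypothetical and simplified: the real code below overlaps
   stores, keeps a store running PREF_AHEAD cache lines ahead as a
   prefetch, and switches to 16B stf.spill stores of f0 when value is
   zero):

        void *memset(void *dest, int value, unsigned long cnt)
        {
                char *p = dest;
                unsigned long v = 0x0101010101010101UL * (unsigned char)value;

                for (; cnt && ((unsigned long)p & 15); cnt--)
                        *p++ = value;                   // head: align to 16B
                for (; cnt >= 128; cnt -= 128)          // 128B line chunks
                        for (int i = 0; i < 16; i++, p += 8)
                                *(unsigned long *)p = v;
                for (; cnt >= 32; cnt -= 32)            // 32B chunks
                        for (int i = 0; i < 4; i++, p += 8)
                                *(unsigned long *)p = v;
                for (; cnt >= 8; cnt -= 8, p += 8)      // remaining words
                        *(unsigned long *)p = v;
                while (cnt--)                           // remaining bytes
                        *p++ = value;
                return dest;
        }  */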

#include <asm/asmmacro.h>
#include <asm/export.h>
#undef ret

#define dest            in0
#define value           in1
#define cnt             in2

#define tmp             r31
#define save_lc         r30
#define ptr0            r29
#define ptr1            r28
#define ptr2            r27
#define ptr3            r26
#define ptr9            r24
#define loopcnt         r23
#define linecnt         r22
#define bytecnt         r21

#define fvalue          f6

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr           p6                      // default register for same-cycle branches
#define p_nz            p7
#define p_zr            p8
#define p_unalgn        p9
#define p_y             p11
#define p_n             p12
#define p_yy            p13
#define p_nn            p14

#define MIN1            15
#define MIN1P1HALF      8
#define LINE_SIZE       128
#define LSIZE_SH        7                       // shift amount
#define PREF_AHEAD      8

GLOBAL_ENTRY(memset)
{ .mmi
        .prologue
        alloc   tmp = ar.pfs, 3, 0, 0, 0
        lfetch.nt1 [dest]                       // prefetch the first line
        .save   ar.lc, save_lc
        mov.i   save_lc = ar.lc
        .body
} { .mmi
        mov     ret0 = dest                     // return value
        cmp.ne  p_nz, p_zr = value, r0          // use stf.spill if value is zero
        cmp.eq  p_scr, p0 = cnt, r0
;; }
{ .mmi
        and     ptr2 = -(MIN1+1), dest          // aligned address
        and     tmp = MIN1, dest                // prepare to check for correct alignment
        tbit.nz p_y, p_n = dest, 0              // Do we have an odd address? (M_B_U)
} { .mib
        mov     ptr1 = dest
        mux1    value = value, @brcst           // replicate the low byte across all 8 bytes
(p_scr) br.ret.dpnt.many rp                     // return immediately if count = 0
;; }
{ .mib
        cmp.ne  p_unalgn, p0 = tmp, r0          // is dest not 16B-aligned?
} { .mib
        sub     bytecnt = (MIN1+1), tmp         // # of bytes up to the next 16B boundary
        cmp.gt  p_scr, p0 = 16, cnt             // is it a minimalistic task?
(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
;; }
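// Head alignment: ptr2 = (dest & -16) + MIN1P1HALF points at the middle
// of dest's 16B window, and bytecnt = 16 - (dest & 15) is the distance
// to the next 16B boundary.  Bits 3..0 of bytecnt select one st8 / st4 /
// st2 / st1 each, filling the head downward from that boundary.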
{ .mmi
(p_unalgn) add  ptr1 = (MIN1+1), ptr2           // after alignment
(p_unalgn) add  ptr2 = MIN1P1HALF, ptr2         // after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3    // should we do a st8 ?
;; }
{ .mib
(p_y)   add     cnt = -8, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2  // should we do a st4 ?
} { .mib
(p_y)   st8     [ptr2] = value,-4
(p_n)   add     ptr2 = 4, ptr2
;; }
{ .mib
(p_yy)  add     cnt = -4, cnt
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1    // should we do a st2 ?
} { .mib
(p_yy)  st4     [ptr2] = value,-2
(p_nn)  add     ptr2 = 2, ptr2
;; }
{ .mmi
        mov     tmp = LINE_SIZE+1               // for compare
(p_y)   add     cnt = -2, cnt
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0  // should we do a st1 ?
} { .mmi
        setf.sig fvalue=value                   // transfer value to FLP side
(p_y)   st2     [ptr2] = value,-1
(p_n)   add     ptr2 = 1, ptr2
;; }

{ .mmi
(p_yy)  st1     [ptr2] = value
        cmp.gt  p_scr, p0 = tmp, cnt            // is it a minimalistic task?
} { .mbb
(p_yy)  add     cnt = -1, cnt
(p_scr) br.cond.dpnt.many .fraction_of_line     // go move just a few
;; }

{ .mib
        nop.m 0
        shr.u   linecnt = cnt, LSIZE_SH
(p_zr)  br.cond.dptk.many .l1b                  // Jump to use stf.spill
;; }

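// Store-ahead scheme (used by both L1A and L1B): ptr9 issues one store
// per 128B line, PREF_AHEAD lines in front of the fill loop.  A store
// allocates the cache line just like a prefetch, so the fill loop
// should always find its line already in cache.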
        TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
{ .mmi
        and     tmp = -(LINE_SIZE), cnt         // compute end of range
        mov     ptr9 = ptr1                     // used for prefetching
        and     cnt = (LINE_SIZE-1), cnt        // remainder
} { .mmi
        mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
        cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
;; }
{ .mmi
(p_scr) add     loopcnt = -1, linecnt
        add     ptr2 = 8, ptr1                  // start of stores (beyond prefetch stores)
        add     ptr1 = tmp, ptr1                // first address beyond total range
;; }
{ .mmi
        add     tmp = -1, linecnt               // next loop count
        mov.i   ar.lc = loopcnt
;; }
.pref_l1a:
{ .mib
        stf8 [ptr9] = fvalue, 128               // Do stores one cache line apart
        nop.i   0
        br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
        add     ptr0 = 16, ptr2                 // Two stores in parallel
        mov.i   ar.lc = tmp
;; }
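// Each pass of .l1ax issues 15 stf8's through ptr2/ptr0 with alternating
// +8/+24 strides; together with the stf8 already done at offset 0 by the
// prefetch stream, that fills one complete 128B line per iteration.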
.l1ax:
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 24
        stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 24
        stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 24
        stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
        stf8 [ptr2] = fvalue, 8
        stf8 [ptr0] = fvalue, 32
        cmp.lt  p_scr, p0 = ptr9, ptr1          // do we need more prefetching?
 ;; }
{ .mmb
        stf8 [ptr2] = fvalue, 24
(p_scr) stf8 [ptr9] = fvalue, 128
        br.cloop.dptk.few .l1ax
;; }
{ .mbb
        cmp.le  p_scr, p0 = 8, cnt              // 8 or more bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line    // Branch no. 2
        br.cond.dpnt.many  .move_bytes_from_alignment   // Branch no. 3
;; }

        TEXT_ALIGN(32)
.l1b:   // ------------------------------------ //  L1B: store ahead into cache lines; fill later
{ .mmi
        and     tmp = -(LINE_SIZE), cnt         // compute end of range
        mov     ptr9 = ptr1                     // used for prefetching
        and     cnt = (LINE_SIZE-1), cnt        // remainder
} { .mmi
        mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
        cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
;; }
{ .mmi
(p_scr) add     loopcnt = -1, linecnt
        add     ptr2 = 16, ptr1                 // start of stores (beyond prefetch stores)
        add     ptr1 = tmp, ptr1                // first address beyond total range
;; }
{ .mmi
        add     tmp = -1, linecnt               // next loop count
        mov.i   ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
        stf.spill [ptr9] = f0, 128              // Do stores one cache line apart
        nop.i   0
        br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
        add     ptr0 = 16, ptr2                 // Two stores in parallel
        mov.i   ar.lc = tmp
;; }
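// The 16B spill image of f0 (+0.0) is all zeroes, so each stf.spill
// below clears 16B at once: 7 spills plus the one done at offset 0 by
// the prefetch stream make up a full 128B line per iteration.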
.l1bx:
 { .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
        stf.spill [ptr2] = f0, 32
        stf.spill [ptr0] = f0, 64
        cmp.lt  p_scr, p0 = ptr9, ptr1          // do we need more prefetching?
 ;; }
{ .mmb
        stf.spill [ptr2] = f0, 32
(p_scr) stf.spill [ptr9] = f0, 128
        br.cloop.dptk.few .l1bx
;; }
{ .mib
        cmp.gt  p_scr, p0 = 8, cnt              // just a few bytes left ?
(p_scr) br.cond.dpnt.many  .move_bytes_from_alignment
;; }

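// Fewer than 128B remain: store 32B per pass through two pointers
// (two stf8's per cycle), then fall through to mop up what's left.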
.fraction_of_line:
{ .mib
        add     ptr2 = 16, ptr1
        shr.u   loopcnt = cnt, 5                // loopcnt = cnt / 32
;; }
{ .mib
        cmp.eq  p_scr, p0 = loopcnt, r0
        add     loopcnt = -1, loopcnt
(p_scr) br.cond.dpnt.many .store_words
;; }
{ .mib
        and     cnt = 0x1f, cnt                 // compute the remaining cnt
        mov.i   ar.lc = loopcnt
;; }
        TEXT_ALIGN(32)
.l2:    // ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
        stf8    [ptr1] = fvalue, 8
        stf8    [ptr2] = fvalue, 8
;; } { .mmb
        stf8    [ptr1] = fvalue, 24
        stf8    [ptr2] = fvalue, 24
        br.cloop.dptk.many .l2
;; }
.store_words:
{ .mib
        cmp.gt  p_scr, p0 = 8, cnt              // just a few bytes left ?
(p_scr) br.cond.dpnt.many .move_bytes_from_alignment    // Branch
;; }

{ .mmi
        stf8    [ptr1] = fvalue, 8              // store
        cmp.le  p_y, p_n = 16, cnt
        add     cnt = -8, cnt                   // subtract
;; }
{ .mmi
(p_y)   stf8    [ptr1] = fvalue, 8              // store
(p_y)   cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)   add     cnt = -8, cnt                   // subtract
;; }
{ .mmi                                          // store
(p_yy)  stf8    [ptr1] = fvalue, 8
(p_yy)  add     cnt = -8, cnt                   // subtract
;; }

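// Tail: fewer than 8 bytes remain.  Bits 2, 1 and 0 of cnt say exactly
// which of st4 / st2 / st1 are still needed.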
.move_bytes_from_alignment:
{ .mib
        cmp.eq  p_scr, p0 = cnt, r0
        tbit.nz.unc p_y, p0 = cnt, 2            // should we terminate with a st4 ?
(p_scr) br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)   st4     [ptr1] = value,4
        tbit.nz.unc p_yy, p0 = cnt, 1           // should we terminate with a st2 ?
;; }
{ .mib
(p_yy)  st2     [ptr1] = value,2
        tbit.nz.unc p_y, p0 = cnt, 0            // should we terminate with a st1 ?
;; }

{ .mib
(p_y)   st1     [ptr1] = value
;; }
.restore_and_exit:
{ .mib
        nop.m   0
        mov.i   ar.lc = save_lc
        br.ret.sptk.many rp
;; }

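// Short counts (< 16B), possibly unaligned: if dest is odd, one st1
// first makes it even, then pairs of st2's through ptr1/ptr2 fill 4
// bytes per cycle; ptr3 points at the very last byte so a final st1
// can finish an odd count.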
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)   cmp.le  p_yy, p_nn = 4, cnt
(p_y)   cmp.le  p_yy, p_nn = 5, cnt
(p_n)   add     ptr2 = 2, ptr1
} { .mmi
(p_y)   add     ptr2 = 3, ptr1
(p_y)   st1     [ptr1] = value, 1               // fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)   add     cnt = -1, cnt
;; }
{ .mmi
(p_yy)  cmp.le.unc p_y, p0 = 8, cnt
        add     ptr3 = ptr1, cnt                // prepare last store
        mov.i   ar.lc = save_lc
} { .mmi
(p_yy)  st2     [ptr1] = value, 4               // fill 2 (aligned) bytes
(p_yy)  st2     [ptr2] = value, 4               // fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)  add     cnt = -4, cnt
;; }
{ .mmi
(p_y)   cmp.le.unc p_yy, p0 = 8, cnt
        add     ptr3 = -1, ptr3                 // last store
        tbit.nz p_scr, p0 = cnt, 1              // will there be a st2 at the end ?
} { .mmi
(p_y)   st2     [ptr1] = value, 4               // fill 2 (aligned) bytes
(p_y)   st2     [ptr2] = value, 4               // fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)   add     cnt = -4, cnt
;; }
{ .mmi
(p_yy)  st2     [ptr1] = value, 4               // fill 2 (aligned) bytes
(p_yy)  st2     [ptr2] = value, 4               // fill 2 (aligned) bytes [3, 2 (or less) left]
        tbit.nz p_y, p0 = cnt, 0                // will there be a st1 at the end ?
} { .mmi
(p_yy)  add     cnt = -4, cnt
;; }
{ .mmb
(p_scr) st2     [ptr1] = value                  // fill 2 (aligned) bytes
(p_y)   st1     [ptr3] = value                  // fill last byte (using ptr3)
        br.ret.sptk.many rp
}
END(memset)
EXPORT_SYMBOL(memset)