/* linux/arch/sparc/lib/M7memset.S */
/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
 */

/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *      void *
 *      memset(void *sp1, int c, size_t n)
 *      {
 *          if (n != 0) {
 *              char *sp = sp1;
 *              do {
 *                  *sp++ = (char)c;
 *              } while (--n != 0);
 *          }
 *          return (sp1);
 *      }
 *
 * The algorithm is as follows:
 *
 *      For small stores of 6 or fewer bytes, bytes will be stored.
 *
 *      For stores of fewer than 32 bytes, align the address on a 4 byte
 *      boundary.  Then store as many 4-byte chunks as possible, followed
 *      by trailing bytes.
 *
 *      For sizes greater than 32 bytes, align the address on an 8 byte
 *      boundary.
 *      if (count >= 64) {
 *              store 8-byte chunks to align the address on a 64 byte boundary
 *              if (value to be set is zero && count >= MIN_ZERO) {
 *                      Using BIS stores, set the first long word of each
 *                      64-byte cache line to zero, which will also clear the
 *                      other seven long words of the cache line.
 *              }
 *              else if (count >= MIN_LOOP) {
 *                      Using BIS stores, set the first long word of each of
 *                      ST_CHUNK cache lines (64 bytes each) before the main
 *                      loop is entered.
 *                      In the main loop, continue pre-setting the first long
 *                      word of each cache line ST_CHUNK lines in advance while
 *                      setting the other seven long words (56 bytes) of each
 *                      cache line until fewer than ST_CHUNK*64 bytes remain.
 *                      Then set the remaining seven long words of each cache
 *                      line that has already had its first long word set.
 *              }
 *              store remaining data in 64-byte chunks until less than
 *              64 bytes remain.
 *      }
 *      Store as many 8-byte chunks as possible, followed by trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline.  That avoids various pipeline delays,
 *   such as filling the miss buffer.  The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memset of greater than MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore.  The
 * benefit of the BIS store must be balanced against the cost of the membar
 * operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used",
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P, which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */
  84#include <asm/asi.h>
  85#include <asm/page.h>
  86
  87#define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
  88#define ASI_STBIMRU_P   ASI_ST_BLKINIT_MRU_P
  89
  90
  91#define ST_CHUNK        24   /* multiple of 4 due to loop unrolling */
  92#define MIN_LOOP        16320
  93#define MIN_ZERO        512
  94
  95        .section        ".text"
  96        .align          32
  97
  98/*
  99 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 100 * (can create a more optimized version later.)
 101 */
 102        .globl          M7clear_page
 103        .globl          M7clear_user_page
 104M7clear_page:           /* clear_page(dest) */
 105M7clear_user_page:
 106        set     PAGE_SIZE, %o1
 107        /* fall through into bzero code */
 108
 109        .size           M7clear_page,.-M7clear_page
 110        .size           M7clear_user_page,.-M7clear_user_page
 111
 112/*
 113 * Define bzero(dest, n) as memset(dest, 0, n)
 114 * (can create a more optimized version later.)
 115 */
 116        .globl          M7bzero
 117M7bzero:                /* bzero(dest, size) */
 118        mov     %o1, %o2
 119        mov     0, %o1
 120        /* fall through into memset code */
 121
 122        .size           M7bzero,.-M7bzero
 123
 124        .global         M7memset
 125        .type           M7memset, #function
 126        .register       %g3, #scratch
 127M7memset:
 128        mov     %o0, %o5                ! copy sp1 before using it
 129        cmp     %o2, 7                  ! if small counts, just write bytes
 130        bleu,pn %xcc, .wrchar
 131         and     %o1, 0xff, %o1          ! o1 is (char)c
 132
 133        sll     %o1, 8, %o3
 134        or      %o1, %o3, %o1           ! now o1 has 2 bytes of c
 135        sll     %o1, 16, %o3
 136        cmp     %o2, 32
 137        blu,pn  %xcc, .wdalign
 138         or      %o1, %o3, %o1           ! now o1 has 4 bytes of c
 139
 140        sllx    %o1, 32, %o3
 141        or      %o1, %o3, %o1           ! now o1 has 8 bytes of c
 142
 143.dbalign:
 144        andcc   %o5, 7, %o3             ! is sp1 aligned on a 8 byte bound?
 145        bz,pt   %xcc, .blkalign         ! already long word aligned
 146         sub     %o3, 8, %o3             ! -(bytes till long word aligned)
 147
 148        add     %o2, %o3, %o2           ! update o2 with new count
 149        ! Set -(%o3) bytes till sp1 long word aligned
 1501:      stb     %o1, [%o5]              ! there is at least 1 byte to set
 151        inccc   %o3                     ! byte clearing loop
 152        bl,pt   %xcc, 1b
 153         inc     %o5
 154
 155        ! Now sp1 is long word aligned (sp1 is found in %o5)
 156.blkalign:
 157        cmp     %o2, 64                 ! check if there are 64 bytes to set
 158        blu,pn  %xcc, .wrshort
 159         mov     %o2, %o3
 160
 161        andcc   %o5, 63, %o3            ! is sp1 block aligned?
 162        bz,pt   %xcc, .blkwr            ! now block aligned
 163         sub     %o3, 64, %o3            ! o3 is -(bytes till block aligned)
 164        add     %o2, %o3, %o2           ! o2 is the remainder
 165
 166        ! Store -(%o3) bytes till dst is block (64 byte) aligned.
 167        ! Use long word stores.
 168        ! Recall that dst is already long word aligned
 1691:
 170        addcc   %o3, 8, %o3
 171        stx     %o1, [%o5]
 172        bl,pt   %xcc, 1b
 173         add     %o5, 8, %o5
 174
 175        ! Now sp1 is block aligned
 176.blkwr:
 177        andn    %o2, 63, %o4            ! calculate size of blocks in bytes
 178        brz,pn  %o1, .wrzero            ! special case if c == 0
 179         and     %o2, 63, %o3            ! %o3 = bytes left after blk stores.
 180
 181        set     MIN_LOOP, %g1
 182        cmp     %o4, %g1                ! check there are enough bytes to set
 183        blu,pn  %xcc, .short_set        ! to justify cost of membar
 184                                        ! must be > pre-cleared lines
 185         nop
 186
 187        ! initial cache-clearing stores
 188        ! get store pipeline moving
 189        rd      %asi, %g3               ! save %asi to be restored later
 190        wr     %g0, ASI_STBIMRU_P, %asi
 191
 192        ! Primary memset loop for large memsets
 193.wr_loop:
 194        sub     %o5, 8, %o5             ! adjust %o5 for ASI store alignment
 195        mov     ST_CHUNK, %g1
 196.wr_loop_start:
 197        stxa    %o1, [%o5+8]%asi
 198        subcc   %g1, 4, %g1
 199        stxa    %o1, [%o5+8+64]%asi
 200        add     %o5, 256, %o5
 201        stxa    %o1, [%o5+8-128]%asi
 202        bgu     %xcc, .wr_loop_start
 203         stxa    %o1, [%o5+8-64]%asi
 204
 205        sub     %o5, ST_CHUNK*64, %o5   ! reset %o5
 206        mov     ST_CHUNK, %g1
 207
 208.wr_loop_rest:
 209        stxa    %o1, [%o5+8+8]%asi
 210        sub     %o4, 64, %o4
 211        stxa    %o1, [%o5+16+8]%asi
 212        subcc   %g1, 1, %g1
 213        stxa    %o1, [%o5+24+8]%asi
 214        stxa    %o1, [%o5+32+8]%asi
 215        stxa    %o1, [%o5+40+8]%asi
 216        add     %o5, 64, %o5
 217        stxa    %o1, [%o5-8]%asi
 218        bgu     %xcc, .wr_loop_rest
 219         stxa    %o1, [%o5]ASI_STBI_P
 220
 221        ! If more than ST_CHUNK*64 bytes remain to set, continue
 222        ! setting the first long word of each cache line in advance
 223        ! to keep the store pipeline moving.
 224
 225        cmp     %o4, ST_CHUNK*64
 226        bge,pt  %xcc, .wr_loop_start
 227         mov     ST_CHUNK, %g1
 228
 229        brz,a,pn %o4, .asi_done
 230         add     %o5, 8, %o5             ! restore %o5 offset
 231
 232.wr_loop_small:
 233        stxa    %o1, [%o5+8]%asi
 234        stxa    %o1, [%o5+8+8]%asi
 235        stxa    %o1, [%o5+16+8]%asi
 236        stxa    %o1, [%o5+24+8]%asi
 237        stxa    %o1, [%o5+32+8]%asi
 238        subcc   %o4, 64, %o4
 239        stxa    %o1, [%o5+40+8]%asi
 240        add     %o5, 64, %o5
 241        stxa    %o1, [%o5-8]%asi
 242        bgu,pt  %xcc, .wr_loop_small
 243         stxa    %o1, [%o5]ASI_STBI_P
 244
 245        ba      .asi_done
 246         add     %o5, 8, %o5             ! restore %o5 offset
 247
 248        ! Special case loop for zero fill memsets
 249        ! For each 64 byte cache line, single STBI to first element
 250        ! clears line
 251.wrzero:
 252        cmp     %o4, MIN_ZERO           ! check if enough bytes to set
 253                                        ! to pay %asi + membar cost
 254        blu     %xcc, .short_set
 255         nop
 256        sub     %o4, 256, %o4
 257
 258.wrzero_loop:
 259        mov     64, %g3
 260        stxa    %o1, [%o5]ASI_STBI_P
 261        subcc   %o4, 256, %o4
 262        stxa    %o1, [%o5+%g3]ASI_STBI_P
 263        add     %o5, 256, %o5
 264        sub     %g3, 192, %g3
 265        stxa    %o1, [%o5+%g3]ASI_STBI_P
 266        add %g3, 64, %g3
 267        bge,pt  %xcc, .wrzero_loop
 268         stxa    %o1, [%o5+%g3]ASI_STBI_P
 269        add     %o4, 256, %o4
 270
 271        brz,pn  %o4, .bsi_done
 272         nop
 273
 274.wrzero_small:
 275        stxa    %o1, [%o5]ASI_STBI_P
 276        subcc   %o4, 64, %o4
 277        bgu,pt  %xcc, .wrzero_small
 278         add     %o5, 64, %o5
 279        ba,a    .bsi_done
 280
 281.asi_done:
 282        wr      %g3, 0x0, %asi          ! restored saved %asi
 283.bsi_done:
 284        membar  #StoreStore             ! required by use of Block Store Init
 285
 286.short_set:
 287        cmp     %o4, 64                 ! check if 64 bytes to set
 288        blu     %xcc, 5f
 289         nop
 2904:                                      ! set final blocks of 64 bytes
 291        stx     %o1, [%o5]
 292        stx     %o1, [%o5+8]
 293        stx     %o1, [%o5+16]
 294        stx     %o1, [%o5+24]
 295        subcc   %o4, 64, %o4
 296        stx     %o1, [%o5+32]
 297        stx     %o1, [%o5+40]
 298        add     %o5, 64, %o5
 299        stx     %o1, [%o5-16]
 300        bgu,pt  %xcc, 4b
 301         stx     %o1, [%o5-8]
 302
 3035:
 304        ! Set the remaining long words
 305.wrshort:
 306        subcc   %o3, 8, %o3             ! Can we store any long words?
 307        blu,pn  %xcc, .wrchars
 308         and     %o2, 7, %o2             ! calc bytes left after long words
 3096:
 310        subcc   %o3, 8, %o3
 311        stx     %o1, [%o5]              ! store the long words
 312        bgeu,pt %xcc, 6b
 313         add     %o5, 8, %o5
 314
 315.wrchars:                               ! check for extra chars
 316        brnz    %o2, .wrfin
 317         nop
 318        retl
 319         nop
 320
 321.wdalign:
 322        andcc   %o5, 3, %o3             ! is sp1 aligned on a word boundary
 323        bz,pn   %xcc, .wrword
 324         andn    %o2, 3, %o3             ! create word sized count in %o3
 325
 326        dec     %o2                     ! decrement count
 327        stb     %o1, [%o5]              ! clear a byte
 328        b       .wdalign
 329         inc     %o5                     ! next byte
 330
 331.wrword:
 332        subcc   %o3, 4, %o3
 333        st      %o1, [%o5]              ! 4-byte writing loop
 334        bnz,pt  %xcc, .wrword
 335         add     %o5, 4, %o5
 336
 337        and     %o2, 3, %o2             ! leftover count, if any
 338
 339.wrchar:
 340        ! Set the remaining bytes, if any
 341        brz     %o2, .exit
 342         nop
 343.wrfin:
 344        deccc   %o2
 345        stb     %o1, [%o5]
 346        bgu,pt  %xcc, .wrfin
 347         inc     %o5
 348.exit:
 349        retl                            ! %o0 was preserved
 350         nop
 351
 352        .size           M7memset,.-M7memset
 353