linux/arch/tile/lib/memset_32.c
<<
>>
Prefs
   1/*
   2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
   3 *
   4 *   This program is free software; you can redistribute it and/or
   5 *   modify it under the terms of the GNU General Public License
   6 *   as published by the Free Software Foundation, version 2.
   7 *
   8 *   This program is distributed in the hope that it will be useful, but
   9 *   WITHOUT ANY WARRANTY; without even the implied warranty of
  10 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  11 *   NON INFRINGEMENT.  See the GNU General Public License for
  12 *   more details.
  13 */
  14
  15#include <arch/chip.h>
  16
  17#include <linux/types.h>
  18#include <linux/string.h>
  19#include <linux/module.h>
  20
  21#undef memset
  22
  23void *memset(void *s, int c, size_t n)
  24{
            /* Fill the n bytes starting at s with the value c (converted to a
             * byte by the uint8_t stores below) and return s.
             *
             * Requests shorter than BYTE_CUTOFF are handled by a plain byte
             * loop.  Longer requests first store stray head and tail bytes
             * until both the pointer and the remaining length are multiples
             * of four, then store 32-bit words: a cache line's worth at a
             * time with explicit prefetching when !CHIP_HAS_WH64(), or whole
             * cache lines preceded by wh64 otherwise.  Any words left over
             * are stored by the final loop.
             */
  25        uint32_t *out32;        /* word-store cursor */
  26        int n32;                /* 32-bit words still to be stored */
  27        uint32_t v16, v32;      /* intermediate and final 32-bit fill pattern */
  28        uint8_t *out8 = s;      /* byte-store cursor */
  29#if !CHIP_HAS_WH64()
  30        int ahead32;            /* prefetch distance ahead of out32, in words */
  31#else
  32        int to_align32;         /* words to store to reach cache-line alignment */
  33#endif
  34
  35        /* Experimentation shows that a trivial tight loop is a win up until
  36         * around a size of 20, where writing a word at a time starts to win.
  37         */
  38#define BYTE_CUTOFF 20
  39
  40#if BYTE_CUTOFF < 3
  41        /* This must be at least this big, or some code later on doesn't
  42         * work: the alignment loop below may store up to three bytes
  43         * without checking 'n'.
             */
  44#error "BYTE_CUTOFF is too small"
  45#endif
  46
  47        if (n < BYTE_CUTOFF) {
  48                /* Strangely, this turns out to be the tightest way to
  49                 * write this loop.
  50                 */
  51                if (n != 0) {
  52                        do {
  53                                /* Strangely, combining these into one line
  54                                 * performs worse.
  55                                 */
  56                                *out8 = c;
  57                                out8++;
  58                        } while (--n != 0);
  59                }
  60
  61                return s;
  62        }
  63
  64#if !CHIP_HAS_WH64()
  65        /* Use a spare issue slot to start prefetching the first cache
  66         * line early. This instruction is free as the store can be buried
  67         * in otherwise idle issue slots doing ALU ops.
  68         */
  69        __insn_prefetch(out8);
  70
  71        /* We prefetch the end so that a short memset that spans two cache
  72         * lines gets some prefetching benefit. Again we believe this is free
  73         * to issue.
  74         */
  75        __insn_prefetch(&out8[n - 1]);
  76#endif /* !CHIP_HAS_WH64() */
  77
  78
  79        /* Align 'out8' to a 4-byte boundary.  At most three bytes are
             * stored here and we know n >= BYTE_CUTOFF >= 3, so this won't
             * write past the end.
             */
  80        while (((uintptr_t) out8 & 3) != 0) {
  81                *out8++ = c;
  82                --n;
  83        }
  84
  85        /* Align 'n': store the trailing bytes one at a time, from the end
             * backwards, until the remaining length is a multiple of four.
             * 'out8' is not moved, so what remains is a whole number of
             * aligned 32-bit words.
             */
  86        while (n & 3)
  87                out8[--n] = c;
  88
  89        out32 = (uint32_t *) out8;
  90        n32 = n >> 2;
  91
  92        /* Tile input byte out to 32 bits. */
  93        v16 = __insn_intlb(c, c);
  94        v32 = __insn_intlh(v16, v16);
  95
  96        /* This must be at least 8 or the following loop doesn't work. */
  97#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
  98
  99#if !CHIP_HAS_WH64()
 100
            /* Default prefetch distance: one cache line ahead of the stores. */
 101        ahead32 = CACHE_LINE_SIZE_IN_WORDS;
 102
 103        /* We already prefetched the first and last cache lines, so
 104         * we only need to do more prefetching if we are storing
 105         * to more than two cache lines.
 106         */
 107        if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
 108                int i;
 109
 110                /* Prefetch the next several cache lines.
 111                 * This is the setup code for the software-pipelined
 112                 * loop below.  The distance is n32 rounded down to a
                     * multiple of the cache line size, capped at MAX_PREFETCH
                     * lines.
 113                 */
 114#define MAX_PREFETCH 5
 115                ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
 116                if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
 117                        ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
 118
 119                for (i = CACHE_LINE_SIZE_IN_WORDS;
 120                     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
 121                        __insn_prefetch(&out32[i]);
 122        }
 123
            /* Bulk loop: each pass issues one prefetch and stores one cache
             * line's worth of words.  It is entered only when more than
             * ahead32 words remain, so the prefetch target lies inside the
             * buffer; whatever it leaves is finished by the leftover loop at
             * the end of the function.
             */
 124        if (n32 > ahead32) {
 125                while (1) {
 126                        int j;
 127
 128                        /* Prefetch by reading one word several cache lines
 129                         * ahead.  Since loads are non-blocking this will
 130                         * cause the full cache line to be read while we are
 131                         * finishing earlier cache lines.  Using a store
 132                         * here causes microarchitectural performance
 133                         * problems where a victimizing store miss goes to
 134                         * the head of the retry FIFO and locks the pipe for
 135                         * a few cycles.  So a few subsequent stores in this
 136                         * loop go into the retry FIFO, and then later
 137                         * stores see other stores to the same cache line
 138                         * are already in the retry FIFO and themselves go
 139                         * into the retry FIFO, filling it up and grinding
 140                         * to a halt waiting for the original miss to be
 141                         * satisfied.
 142                         */
 143                        __insn_prefetch(&out32[ahead32]);
 144
 145#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
 146#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
 147#endif
 148
                            /* Account up front for the words stored by the
                             * unrolled loop just below.
                             */
 149                        n32 -= CACHE_LINE_SIZE_IN_WORDS;
 150
 151                        /* Save icache space by only partially unrolling
 152                         * this loop.
 153                         */
 154                        for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
 155                                *out32++ = v32;
 156                                *out32++ = v32;
 157                                *out32++ = v32;
 158                                *out32++ = v32;
 159                        }
 160
 161                        /* To save compiled code size, reuse this loop even
 162                         * when we run out of prefetching to do by dropping
 163                         * ahead32 down.
 164                         */
 165                        if (n32 <= ahead32) {
 166                                /* Not even a full cache line left,
 167                                 * so stop now.
 168                                 */
 169                                if (n32 < CACHE_LINE_SIZE_IN_WORDS)
 170                                        break;
 171
 172                                /* Choose a small enough value that we don't
 173                                 * prefetch past the end.  There's no sense
 174                                 * in touching cache lines we don't have to.
 175                                 */
 176                                ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
 177                        }
 178                }
 179        }
 180
 181#else /* CHIP_HAS_WH64() */
 182
 183        /* Determine how many words we need to emit before the 'out32'
 184         * pointer becomes aligned modulo the cache line size.
 185         */
 186        to_align32 =
 187                (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
 188
 189        /* Only bother aligning and using wh64 if there is at least
 190         * one full cache line to process.  This check also prevents
 191         * overrunning the end of the buffer with alignment words.
             *
             * NOTE(review): for buffers shorter than a cache line the right
             * hand side is negative, so this relies on a signed comparison;
             * confirm CHIP_L2_LINE_SIZE() expands to a signed constant.
 192         */
 193        if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
 194                int lines_left;
 195
 196                /* Align out32 mod the cache line size so we can use wh64. */
 197                n32 -= to_align32;
 198                for (; to_align32 != 0; to_align32--) {
 199                        *out32 = v32;
 200                        out32++;
 201                }
 202
 203                /* Use unsigned divide to turn this into a right shift.
                     * The enclosing check guarantees at least one full line
                     * remains, so lines_left >= 1 and the do/while loops below
                     * never start with a zero count.
                     */
 204                lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
 205
 206                do {
 207                        /* Only wh64 a few lines at a time, so we don't
 208                         * exceed the maximum number of victim lines.
                             * 'x' is the number of lines handled this pass.
 209                         */
 210                        int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
 211                                  ? lines_left
 212                                  : CHIP_MAX_OUTSTANDING_VICTIMS());
 213                        uint32_t *wh = out32;
 214                        int i = x;
 215                        int j;
 216
 217                        lines_left -= x;
 218
                            /* Issue wh64 on each of the x lines before any of
                             * them is stored to.
                             * NOTE(review): presumably wh64 claims the line
                             * without fetching its old contents; confirm
                             * against the ISA documentation.
                             */
 219                        do {
 220                                __insn_wh64(wh);
 221                                wh += CACHE_LINE_SIZE_IN_WORDS;
 222                        } while (--i);
 223
                            /* Fill those x lines, four words per iteration. */
 224                        for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4);
 225                             j != 0; j--) {
 226                                *out32++ = v32;
 227                                *out32++ = v32;
 228                                *out32++ = v32;
 229                                *out32++ = v32;
 230                        }
 231                } while (lines_left != 0);
 232
 233                /* We processed all full lines above, so only this many
 234                 * words remain to be processed.
 235                 */
 236                n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 237        }
 238
 239#endif /* CHIP_HAS_WH64() */
 240
 241        /* Now handle any leftover values, one word at a time. */
 242        if (n32 != 0) {
 243                do {
 244                        *out32 = v32;
 245                        out32++;
 246                } while (--n32 != 0);
 247        }
 248
 249        return s;
 250}
 251EXPORT_SYMBOL(memset);
 252