1/* 2 * Copyright 2010 Tilera Corporation. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation, version 2. 7 * 8 * This program is distributed in the hope that it will be useful, but 9 * WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 11 * NON INFRINGEMENT. See the GNU General Public License for 12 * more details. 13 */ 14 15#include <arch/chip.h> 16 17#include <linux/types.h> 18#include <linux/string.h> 19#include <linux/module.h> 20 21#undef memset 22 23void *memset(void *s, int c, size_t n) 24{ 25 uint32_t *out32; 26 int n32; 27 uint32_t v16, v32; 28 uint8_t *out8 = s; 29#if !CHIP_HAS_WH64() 30 int ahead32; 31#else 32 int to_align32; 33#endif 34 35 /* Experimentation shows that a trivial tight loop is a win up until 36 * around a size of 20, where writing a word at a time starts to win. 37 */ 38#define BYTE_CUTOFF 20 39 40#if BYTE_CUTOFF < 3 41 /* This must be at least at least this big, or some code later 42 * on doesn't work. 43 */ 44#error "BYTE_CUTOFF is too small" 45#endif 46 47 if (n < BYTE_CUTOFF) { 48 /* Strangely, this turns out to be the tightest way to 49 * write this loop. 50 */ 51 if (n != 0) { 52 do { 53 /* Strangely, combining these into one line 54 * performs worse. 55 */ 56 *out8 = c; 57 out8++; 58 } while (--n != 0); 59 } 60 61 return s; 62 } 63 64#if !CHIP_HAS_WH64() 65 /* Use a spare issue slot to start prefetching the first cache 66 * line early. This instruction is free as the store can be buried 67 * in otherwise idle issue slots doing ALU ops. 68 */ 69 __insn_prefetch(out8); 70 71 /* We prefetch the end so that a short memset that spans two cache 72 * lines gets some prefetching benefit. Again we believe this is free 73 * to issue. 74 */ 75 __insn_prefetch(&out8[n - 1]); 76#endif /* !CHIP_HAS_WH64() */ 77 78 79 /* Align 'out8'. We know n >= 3 so this won't write past the end. */ 80 while (((uintptr_t) out8 & 3) != 0) { 81 *out8++ = c; 82 --n; 83 } 84 85 /* Align 'n'. */ 86 while (n & 3) 87 out8[--n] = c; 88 89 out32 = (uint32_t *) out8; 90 n32 = n >> 2; 91 92 /* Tile input byte out to 32 bits. */ 93 v16 = __insn_intlb(c, c); 94 v32 = __insn_intlh(v16, v16); 95 96 /* This must be at least 8 or the following loop doesn't work. */ 97#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) 98 99#if !CHIP_HAS_WH64() 100 101 ahead32 = CACHE_LINE_SIZE_IN_WORDS; 102 103 /* We already prefetched the first and last cache lines, so 104 * we only need to do more prefetching if we are storing 105 * to more than two cache lines. 106 */ 107 if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { 108 int i; 109 110 /* Prefetch the next several cache lines. 111 * This is the setup code for the software-pipelined 112 * loop below. 113 */ 114#define MAX_PREFETCH 5 115 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; 116 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) 117 ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; 118 119 for (i = CACHE_LINE_SIZE_IN_WORDS; 120 i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) 121 __insn_prefetch(&out32[i]); 122 } 123 124 if (n32 > ahead32) { 125 while (1) { 126 int j; 127 128 /* Prefetch by reading one word several cache lines 129 * ahead. Since loads are non-blocking this will 130 * cause the full cache line to be read while we are 131 * finishing earlier cache lines. Using a store 132 * here causes microarchitectural performance 133 * problems where a victimizing store miss goes to 134 * the head of the retry FIFO and locks the pipe for 135 * a few cycles. So a few subsequent stores in this 136 * loop go into the retry FIFO, and then later 137 * stores see other stores to the same cache line 138 * are already in the retry FIFO and themselves go 139 * into the retry FIFO, filling it up and grinding 140 * to a halt waiting for the original miss to be 141 * satisfied. 142 */ 143 __insn_prefetch(&out32[ahead32]); 144 145#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 146#error "Unhandled CACHE_LINE_SIZE_IN_WORDS" 147#endif 148 149 n32 -= CACHE_LINE_SIZE_IN_WORDS; 150 151 /* Save icache space by only partially unrolling 152 * this loop. 153 */ 154 for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { 155 *out32++ = v32; 156 *out32++ = v32; 157 *out32++ = v32; 158 *out32++ = v32; 159 } 160 161 /* To save compiled code size, reuse this loop even 162 * when we run out of prefetching to do by dropping 163 * ahead32 down. 164 */ 165 if (n32 <= ahead32) { 166 /* Not even a full cache line left, 167 * so stop now. 168 */ 169 if (n32 < CACHE_LINE_SIZE_IN_WORDS) 170 break; 171 172 /* Choose a small enough value that we don't 173 * prefetch past the end. There's no sense 174 * in touching cache lines we don't have to. 175 */ 176 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; 177 } 178 } 179 } 180 181#else /* CHIP_HAS_WH64() */ 182 183 /* Determine how many words we need to emit before the 'out32' 184 * pointer becomes aligned modulo the cache line size. 185 */ 186 to_align32 = 187 (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1); 188 189 /* Only bother aligning and using wh64 if there is at least 190 * one full cache line to process. This check also prevents 191 * overrunning the end of the buffer with alignment words. 192 */ 193 if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) { 194 int lines_left; 195 196 /* Align out32 mod the cache line size so we can use wh64. */ 197 n32 -= to_align32; 198 for (; to_align32 != 0; to_align32--) { 199 *out32 = v32; 200 out32++; 201 } 202 203 /* Use unsigned divide to turn this into a right shift. */ 204 lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS; 205 206 do { 207 /* Only wh64 a few lines at a time, so we don't 208 * exceed the maximum number of victim lines. 209 */ 210 int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) 211 ? lines_left 212 : CHIP_MAX_OUTSTANDING_VICTIMS()); 213 uint32_t *wh = out32; 214 int i = x; 215 int j; 216 217 lines_left -= x; 218 219 do { 220 __insn_wh64(wh); 221 wh += CACHE_LINE_SIZE_IN_WORDS; 222 } while (--i); 223 224 for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); 225 j != 0; j--) { 226 *out32++ = v32; 227 *out32++ = v32; 228 *out32++ = v32; 229 *out32++ = v32; 230 } 231 } while (lines_left != 0); 232 233 /* We processed all full lines above, so only this many 234 * words remain to be processed. 235 */ 236 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; 237 } 238 239#endif /* CHIP_HAS_WH64() */ 240 241 /* Now handle any leftover values. */ 242 if (n32 != 0) { 243 do { 244 *out32 = v32; 245 out32++; 246 } while (--n32 != 0); 247 } 248 249 return s; 250} 251EXPORT_SYMBOL(memset); 252