arch/tile/lib/memset_32.c

   1 /*
   2  * Copyright 2010 Tilera Corporation. All Rights Reserved.
   3  *
   4  *   This program is free software; you can redistribute it and/or
   5  *   modify it under the terms of the GNU General Public License
   6  *   as published by the Free Software Foundation, version 2.
   7  *
   8  *   This program is distributed in the hope that it will be useful, but
   9  *   WITHOUT ANY WARRANTY; without even the implied warranty of
  10  *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  11  *   NON INFRINGEMENT.  See the GNU General Public License for
  12  *   more details.
  13  */
  14
  15 #include <arch/chip.h>
  16
  17 #include <linux/types.h>
  18 #include <linux/string.h>
  19 #include <linux/module.h>
  20
  21
  22 void *memset(void *s, int c, size_t n)
  23 {
  24         uint32_t *out32;
  25         int n32;
  26         uint32_t v16, v32;
  27         uint8_t *out8 = s;
  28 #if !CHIP_HAS_WH64()
  29         int ahead32;
  30 #else
  31         int to_align32;
  32 #endif
  33
  34         /* Experimentation shows that a trivial tight loop is a win up until
  35          * around a size of 20, where writing a word at a time starts to win.
  36          */
  37 #define BYTE_CUTOFF 20
  38
  39 #if BYTE_CUTOFF < 3
  40         /* This must be at least at least this big, or some code later
  41          * on doesn't work.
  42          */
  43 #error "BYTE_CUTOFF is too small"
  44 #endif
  45
  46         if (n < BYTE_CUTOFF) {
  47                 /* Strangely, this turns out to be the tightest way to
  48                  * write this loop.
  49                  */
  50                 if (n != 0) {
  51                         do {
  52                                 /* Strangely, combining these into one line
  53                                  * performs worse.
  54                                  */
  55                                 *out8 = c;
  56                                 out8++;
  57                         } while (--n != 0);
  58                 }
  59
  60                 return s;
  61         }
  62
  63 #if !CHIP_HAS_WH64()
  64         /* Use a spare issue slot to start prefetching the first cache
  65          * line early. This instruction is free as the store can be buried
  66          * in otherwise idle issue slots doing ALU ops.
  67          */
  68         __insn_prefetch(out8);
  69
  70         /* We prefetch the end so that a short memset that spans two cache
  71          * lines gets some prefetching benefit. Again we believe this is free
  72          * to issue.
  73          */
  74         __insn_prefetch(&out8[n - 1]);
  75 #endif /* !CHIP_HAS_WH64() */
  76
  77
  78         /* Align 'out8'. We know n >= 3 so this won't write past the end. */
  79         while (((uintptr_t) out8 & 3) != 0) {
  80                 *out8++ = c;
  81                 --n;
  82         }
  83
  84         /* Align 'n'. */
  85         while (n & 3)
  86                 out8[--n] = c;
  87
  88         out32 = (uint32_t *) out8;
  89         n32 = n >> 2;
  90
  91         /* Tile input byte out to 32 bits. */
  92         v16 = __insn_intlb(c, c);
  93         v32 = __insn_intlh(v16, v16);
  94
  95         /* This must be at least 8 or the following loop doesn't work. */
  96 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
  97
  98 #if !CHIP_HAS_WH64()
  99
 100         ahead32 = CACHE_LINE_SIZE_IN_WORDS;
 101
 102         /* We already prefetched the first and last cache lines, so
 103          * we only need to do more prefetching if we are storing
 104          * to more than two cache lines.
 105          */
 106         if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
 107                 int i;
 108
 109                 /* Prefetch the next several cache lines.
 110                  * This is the setup code for the software-pipelined
 111                  * loop below.
 112                  */
 113 #define MAX_PREFETCH 5
 114                 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
 115                 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
 116                         ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
 117
 118                 for (i = CACHE_LINE_SIZE_IN_WORDS;
 119                      i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
 120                         __insn_prefetch(&out32[i]);
 121         }
 122
 123         if (n32 > ahead32) {
 124                 while (1) {
 125                         int j;
 126
 127                         /* Prefetch by reading one word several cache lines
 128                          * ahead.  Since loads are non-blocking this will
 129                          * cause the full cache line to be read while we are
 130                          * finishing earlier cache lines.  Using a store
 131                          * here causes microarchitectural performance
 132                          * problems where a victimizing store miss goes to
 133                          * the head of the retry FIFO and locks the pipe for
 134                          * a few cycles.  So a few subsequent stores in this
 135                          * loop go into the retry FIFO, and then later
 136                          * stores see other stores to the same cache line
 137                          * are already in the retry FIFO and themselves go
 138                          * into the retry FIFO, filling it up and grinding
 139                          * to a halt waiting for the original miss to be
 140                          * satisfied.
 141                          */
 142                         __insn_prefetch(&out32[ahead32]);
 143
 144 #if 1
 145 #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
 146 #error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
 147 #endif
 148
 149                         n32 -= CACHE_LINE_SIZE_IN_WORDS;
 150
 151                         /* Save icache space by only partially unrolling
 152                          * this loop.
 153                          */
 154                         for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
 155                                 *out32++ = v32;
 156                                 *out32++ = v32;
 157                                 *out32++ = v32;
 158                                 *out32++ = v32;
 159                         }
 160 #else
 161                         /* Unfortunately, due to a code generator flaw this
 162                          * allocates a separate register for each of these
 163                          * stores, which requires a large number of spills,
 164                          * which makes this procedure enormously bigger
 165                          * (something like 70%)
 166                          */
 167                         *out32++ = v32;
 168                         *out32++ = v32;
 169                         *out32++ = v32;
 170                         *out32++ = v32;
 171                         *out32++ = v32;
 172                         *out32++ = v32;
 173                         *out32++ = v32;
 174                         *out32++ = v32;
 175                         *out32++ = v32;
 176                         *out32++ = v32;
 177                         *out32++ = v32;
 178                         *out32++ = v32;
 179                         *out32++ = v32;
 180                         *out32++ = v32;
 181                         *out32++ = v32;
 182                         n32 -= 16;
 183 #endif
 184
 185                         /* To save compiled code size, reuse this loop even
 186                          * when we run out of prefetching to do by dropping
 187                          * ahead32 down.
 188                          */
 189                         if (n32 <= ahead32) {
 190                                 /* Not even a full cache line left,
 191                                  * so stop now.
 192                                  */
 193                                 if (n32 < CACHE_LINE_SIZE_IN_WORDS)
 194                                         break;
 195
 196                                 /* Choose a small enough value that we don't
 197                                  * prefetch past the end.  There's no sense
 198                                  * in touching cache lines we don't have to.
 199                                  */
 200                                 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
 201                         }
 202                 }
 203         }
 204
 205 #else /* CHIP_HAS_WH64() */
 206
 207         /* Determine how many words we need to emit before the 'out32'
 208          * pointer becomes aligned modulo the cache line size.
 209          */
 210         to_align32 =
 211                 (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
 212
 213         /* Only bother aligning and using wh64 if there is at least
 214          * one full cache line to process.  This check also prevents
 215          * overrunning the end of the buffer with alignment words.
 216          */
 217         if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
 218                 int lines_left;
 219
 220                 /* Align out32 mod the cache line size so we can use wh64. */
 221                 n32 -= to_align32;
 222                 for (; to_align32 != 0; to_align32--) {
 223                         *out32 = v32;
 224                         out32++;
 225                 }
 226
 227                 /* Use unsigned divide to turn this into a right shift. */
 228                 lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
 229
 230                 do {
 231                         /* Only wh64 a few lines at a time, so we don't
 232                          * exceed the maximum number of victim lines.
 233                          */
 234                         int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
 235                                   ? lines_left
 236                                   : CHIP_MAX_OUTSTANDING_VICTIMS());
 237                         uint32_t *wh = out32;
 238                         int i = x;
 239                         int j;
 240
 241                         lines_left -= x;
 242
 243                         do {
 244                                 __insn_wh64(wh);
 245                                 wh += CACHE_LINE_SIZE_IN_WORDS;
 246                         } while (--i);
 247
 248                         for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); j != 0; j--) {
 249                                 *out32++ = v32;
 250                                 *out32++ = v32;
 251                                 *out32++ = v32;
 252                                 *out32++ = v32;
 253                         }
 254                 } while (lines_left != 0);
 255
 256                 /* We processed all full lines above, so only this many
 257                  * words remain to be processed.
 258                  */
 259                 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 260         }
 261
 262 #endif /* CHIP_HAS_WH64() */
 263
 264         /* Now handle any leftover values. */
 265         if (n32 != 0) {
 266                 do {
 267                         *out32 = v32;
 268                         out32++;
 269                 } while (--n32 != 0);
 270         }
 271
 272         return s;
 273 }
 274 EXPORT_SYMBOL(memset);