Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jikos/hid
[deliverable/linux.git] / arch / parisc / lib / memcpy.c
CommitLineData
1da177e4
LT
1/*
2 * Optimized memory copy routines.
3 *
4 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
5b879d78 5 * Copyright (C) 2013 Helge Deller <deller@gmx.de>
1da177e4
LT
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Portions derived from the GNU C Library
22 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
23 *
24 * Several strategies are tried to try to get the best performance for various
25 * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using
26 * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
27 * general registers. Unaligned copies are handled either by aligning the
28 * destination and then using shift-and-write method, or in a few cases by
29 * falling back to a byte-at-a-time copy.
30 *
31 * I chose to implement this in C because it is easier to maintain and debug,
32 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
33 * at the time of writing) is fairly optimal. Unfortunately some of the
34 * semantics of the copy routine (exception handling) is difficult to express
35 * in C, so we have to play some tricks to get it to work.
36 *
37 * All the loads and stores are done via explicit asm() code in order to use
38 * the right space registers.
39 *
40 * Testing with various alignments and buffer sizes shows that this code is
41 * often >10x faster than a simple byte-at-a-time copy, even for strangely
42 * aligned operands. It is interesting to note that the glibc version
43 * of memcpy (written in C) is actually quite fast already. This routine is
44 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
45 * but in some cases the glibc version is still slightly faster. This lends
46 * more credibility that gcc can generate very good code as long as we are
47 * careful.
48 *
49 * TODO:
50 * - cache prefetching needs more experimentation to get optimal settings
51 * - try not to use the post-increment address modifiers; they create additional
52 * interlocks
53 * - replace byte-copy loops with stybs sequences
54 */
55
56#ifdef __KERNEL__
1da177e4
LT
57#include <linux/module.h>
58#include <linux/compiler.h>
db080f9c 59#include <linux/uaccess.h>
1da177e4
LT
60#define s_space "%%sr1"
61#define d_space "%%sr2"
62#else
63#include "memcpy.h"
64#define s_space "%%sr0"
65#define d_space "%%sr0"
66#define pa_memcpy new2_copy
67#endif
68
69DECLARE_PER_CPU(struct exception_data, exception_data);
70
71#define preserve_branch(label) do { \
d8d0524a 72 volatile int dummy = 0; \
1da177e4
LT
73 /* The following branch is never taken, it's just here to */ \
74 /* prevent gcc from optimizing away our exception code. */ \
75 if (unlikely(dummy != dummy)) \
76 goto label; \
77} while (0)
78
79#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
80#define get_kernel_space() (0)
81
82#define MERGE(w0, sh_1, w1, sh_2) ({ \
83 unsigned int _r; \
84 asm volatile ( \
85 "mtsar %3\n" \
86 "shrpw %1, %2, %%sar, %0\n" \
87 : "=r"(_r) \
88 : "r"(w0), "r"(w1), "r"(sh_2) \
89 ); \
90 _r; \
91})
92#define THRESHOLD 16
93
94#ifdef DEBUG_MEMCPY
91bae23c 95#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
1da177e4
LT
96#else
97#define DPRINTF(fmt, args...)
98#endif
99
1da177e4
LT
100#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
101 __asm__ __volatile__ ( \
0b3d643f
HD
102 "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
103 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
1da177e4
LT
104 : _tt(_t), "+r"(_a) \
105 : \
106 : "r8")
107
108#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
109 __asm__ __volatile__ ( \
0b3d643f
HD
110 "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
111 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
1da177e4
LT
112 : "+r"(_a) \
113 : _tt(_t) \
114 : "r8")
115
116#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
117#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
118#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
119#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
120#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
121#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
122
123#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \
124 __asm__ __volatile__ ( \
0b3d643f
HD
125 "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t" \
126 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
1da177e4
LT
127 : _tt(_t) \
128 : "r"(_a) \
129 : "r8")
130
131#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \
132 __asm__ __volatile__ ( \
0b3d643f
HD
133 "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t" \
134 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
1da177e4
LT
135 : \
136 : _tt(_t), "r"(_a) \
137 : "r8")
138
139#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
140#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e)
141
142#ifdef CONFIG_PREFETCH
f13cec84 143static inline void prefetch_src(const void *addr)
1da177e4
LT
144{
145 __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
146}
147
f13cec84 148static inline void prefetch_dst(const void *addr)
1da177e4
LT
149{
150 __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
151}
152#else
0b3d643f
HD
153#define prefetch_src(addr) do { } while(0)
154#define prefetch_dst(addr) do { } while(0)
1da177e4
LT
155#endif
156
5b879d78
HD
157#define PA_MEMCPY_OK 0
158#define PA_MEMCPY_LOAD_ERROR 1
159#define PA_MEMCPY_STORE_ERROR 2
160
1da177e4
LT
161/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
162 * per loop. This code is derived from glibc.
163 */
9af63aed 164static noinline unsigned long copy_dstaligned(unsigned long dst,
5b879d78 165 unsigned long src, unsigned long len)
1da177e4
LT
166{
167 /* gcc complains that a2 and a3 may be uninitialized, but actually
168 * they cannot be. Initialize a2/a3 to shut gcc up.
169 */
170 register unsigned int a0, a1, a2 = 0, a3 = 0;
171 int sh_1, sh_2;
1da177e4
LT
172
173 /* prefetch_src((const void *)src); */
174
175 /* Calculate how to shift a word read at the memory operation
176 aligned srcp to make it aligned for copy. */
177 sh_1 = 8 * (src % sizeof(unsigned int));
178 sh_2 = 8 * sizeof(unsigned int) - sh_1;
179
180 /* Make src aligned by rounding it down. */
181 src &= -sizeof(unsigned int);
182
183 switch (len % 4)
184 {
185 case 2:
186 /* a1 = ((unsigned int *) src)[0];
187 a2 = ((unsigned int *) src)[1]; */
188 ldw(s_space, 0, src, a1, cda_ldw_exc);
189 ldw(s_space, 4, src, a2, cda_ldw_exc);
190 src -= 1 * sizeof(unsigned int);
191 dst -= 3 * sizeof(unsigned int);
192 len += 2;
193 goto do1;
194 case 3:
195 /* a0 = ((unsigned int *) src)[0];
196 a1 = ((unsigned int *) src)[1]; */
197 ldw(s_space, 0, src, a0, cda_ldw_exc);
198 ldw(s_space, 4, src, a1, cda_ldw_exc);
199 src -= 0 * sizeof(unsigned int);
200 dst -= 2 * sizeof(unsigned int);
201 len += 1;
202 goto do2;
203 case 0:
204 if (len == 0)
5b879d78 205 return PA_MEMCPY_OK;
1da177e4
LT
206 /* a3 = ((unsigned int *) src)[0];
207 a0 = ((unsigned int *) src)[1]; */
208 ldw(s_space, 0, src, a3, cda_ldw_exc);
209 ldw(s_space, 4, src, a0, cda_ldw_exc);
210 src -=-1 * sizeof(unsigned int);
211 dst -= 1 * sizeof(unsigned int);
212 len += 0;
213 goto do3;
214 case 1:
215 /* a2 = ((unsigned int *) src)[0];
216 a3 = ((unsigned int *) src)[1]; */
217 ldw(s_space, 0, src, a2, cda_ldw_exc);
218 ldw(s_space, 4, src, a3, cda_ldw_exc);
219 src -=-2 * sizeof(unsigned int);
220 dst -= 0 * sizeof(unsigned int);
221 len -= 1;
222 if (len == 0)
223 goto do0;
224 goto do4; /* No-op. */
225 }
226
227 do
228 {
229 /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
230do4:
231 /* a0 = ((unsigned int *) src)[0]; */
232 ldw(s_space, 0, src, a0, cda_ldw_exc);
233 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
234 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
235do3:
236 /* a1 = ((unsigned int *) src)[1]; */
237 ldw(s_space, 4, src, a1, cda_ldw_exc);
238 /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
239 stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
240do2:
241 /* a2 = ((unsigned int *) src)[2]; */
242 ldw(s_space, 8, src, a2, cda_ldw_exc);
243 /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
244 stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
245do1:
246 /* a3 = ((unsigned int *) src)[3]; */
247 ldw(s_space, 12, src, a3, cda_ldw_exc);
248 /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
249 stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);
250
251 src += 4 * sizeof(unsigned int);
252 dst += 4 * sizeof(unsigned int);
253 len -= 4;
254 }
255 while (len != 0);
256
257do0:
258 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
259 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
260
261 preserve_branch(handle_load_error);
262 preserve_branch(handle_store_error);
263
5b879d78 264 return PA_MEMCPY_OK;
1da177e4
LT
265
266handle_load_error:
267 __asm__ __volatile__ ("cda_ldw_exc:\n");
5b879d78 268 return PA_MEMCPY_LOAD_ERROR;
1da177e4
LT
269
270handle_store_error:
271 __asm__ __volatile__ ("cda_stw_exc:\n");
5b879d78 272 return PA_MEMCPY_STORE_ERROR;
1da177e4
LT
273}
274
275
5b879d78
HD
276/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
277 * In case of an access fault the faulty address can be read from the per_cpu
278 * exception data struct. */
9af63aed 279static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
5b879d78 280 unsigned long len)
1da177e4
LT
281{
282 register unsigned long src, dst, t1, t2, t3;
283 register unsigned char *pcs, *pcd;
284 register unsigned int *pws, *pwd;
285 register double *pds, *pdd;
5b879d78 286 unsigned long ret;
1da177e4
LT
287
288 src = (unsigned long)srcp;
289 dst = (unsigned long)dstp;
290 pcs = (unsigned char *)srcp;
291 pcd = (unsigned char *)dstp;
292
1da177e4
LT
293 /* prefetch_src((const void *)srcp); */
294
295 if (len < THRESHOLD)
296 goto byte_copy;
297
298 /* Check alignment */
299 t1 = (src ^ dst);
300 if (unlikely(t1 & (sizeof(double)-1)))
301 goto unaligned_copy;
302
303 /* src and dst have same alignment. */
304
305 /* Copy bytes till we are double-aligned. */
306 t2 = src & (sizeof(double) - 1);
307 if (unlikely(t2 != 0)) {
308 t2 = sizeof(double) - t2;
309 while (t2 && len) {
310 /* *pcd++ = *pcs++; */
311 ldbma(s_space, pcs, t3, pmc_load_exc);
312 len--;
313 stbma(d_space, t3, pcd, pmc_store_exc);
314 t2--;
315 }
316 }
317
318 pds = (double *)pcs;
319 pdd = (double *)pcd;
320
fa681a18 321#if 0
1da177e4
LT
322 /* Copy 8 doubles at a time */
323 while (len >= 8*sizeof(double)) {
324 register double r1, r2, r3, r4, r5, r6, r7, r8;
325 /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
326 flddma(s_space, pds, r1, pmc_load_exc);
327 flddma(s_space, pds, r2, pmc_load_exc);
328 flddma(s_space, pds, r3, pmc_load_exc);
329 flddma(s_space, pds, r4, pmc_load_exc);
330 fstdma(d_space, r1, pdd, pmc_store_exc);
331 fstdma(d_space, r2, pdd, pmc_store_exc);
332 fstdma(d_space, r3, pdd, pmc_store_exc);
333 fstdma(d_space, r4, pdd, pmc_store_exc);
334
335#if 0
336 if (L1_CACHE_BYTES <= 32)
337 prefetch_src((char *)pds + L1_CACHE_BYTES);
338#endif
339 flddma(s_space, pds, r5, pmc_load_exc);
340 flddma(s_space, pds, r6, pmc_load_exc);
341 flddma(s_space, pds, r7, pmc_load_exc);
342 flddma(s_space, pds, r8, pmc_load_exc);
343 fstdma(d_space, r5, pdd, pmc_store_exc);
344 fstdma(d_space, r6, pdd, pmc_store_exc);
345 fstdma(d_space, r7, pdd, pmc_store_exc);
346 fstdma(d_space, r8, pdd, pmc_store_exc);
347 len -= 8*sizeof(double);
348 }
fa681a18 349#endif
1da177e4
LT
350
351 pws = (unsigned int *)pds;
352 pwd = (unsigned int *)pdd;
353
354word_copy:
355 while (len >= 8*sizeof(unsigned int)) {
356 register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
357 /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
358 ldwma(s_space, pws, r1, pmc_load_exc);
359 ldwma(s_space, pws, r2, pmc_load_exc);
360 ldwma(s_space, pws, r3, pmc_load_exc);
361 ldwma(s_space, pws, r4, pmc_load_exc);
362 stwma(d_space, r1, pwd, pmc_store_exc);
363 stwma(d_space, r2, pwd, pmc_store_exc);
364 stwma(d_space, r3, pwd, pmc_store_exc);
365 stwma(d_space, r4, pwd, pmc_store_exc);
366
367 ldwma(s_space, pws, r5, pmc_load_exc);
368 ldwma(s_space, pws, r6, pmc_load_exc);
369 ldwma(s_space, pws, r7, pmc_load_exc);
370 ldwma(s_space, pws, r8, pmc_load_exc);
371 stwma(d_space, r5, pwd, pmc_store_exc);
372 stwma(d_space, r6, pwd, pmc_store_exc);
373 stwma(d_space, r7, pwd, pmc_store_exc);
374 stwma(d_space, r8, pwd, pmc_store_exc);
375 len -= 8*sizeof(unsigned int);
376 }
377
378 while (len >= 4*sizeof(unsigned int)) {
379 register unsigned int r1,r2,r3,r4;
380 ldwma(s_space, pws, r1, pmc_load_exc);
381 ldwma(s_space, pws, r2, pmc_load_exc);
382 ldwma(s_space, pws, r3, pmc_load_exc);
383 ldwma(s_space, pws, r4, pmc_load_exc);
384 stwma(d_space, r1, pwd, pmc_store_exc);
385 stwma(d_space, r2, pwd, pmc_store_exc);
386 stwma(d_space, r3, pwd, pmc_store_exc);
387 stwma(d_space, r4, pwd, pmc_store_exc);
388 len -= 4*sizeof(unsigned int);
389 }
390
391 pcs = (unsigned char *)pws;
392 pcd = (unsigned char *)pwd;
393
394byte_copy:
395 while (len) {
396 /* *pcd++ = *pcs++; */
397 ldbma(s_space, pcs, t3, pmc_load_exc);
398 stbma(d_space, t3, pcd, pmc_store_exc);
399 len--;
400 }
401
5b879d78 402 return PA_MEMCPY_OK;
1da177e4
LT
403
404unaligned_copy:
405 /* possibly we are aligned on a word, but not on a double... */
87451d85 406 if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
1da177e4
LT
407 t2 = src & (sizeof(unsigned int) - 1);
408
409 if (unlikely(t2 != 0)) {
410 t2 = sizeof(unsigned int) - t2;
411 while (t2) {
412 /* *pcd++ = *pcs++; */
413 ldbma(s_space, pcs, t3, pmc_load_exc);
414 stbma(d_space, t3, pcd, pmc_store_exc);
415 len--;
416 t2--;
417 }
418 }
419
420 pws = (unsigned int *)pcs;
421 pwd = (unsigned int *)pcd;
422 goto word_copy;
423 }
424
425 /* Align the destination. */
426 if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
427 t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
428 while (t2) {
429 /* *pcd++ = *pcs++; */
430 ldbma(s_space, pcs, t3, pmc_load_exc);
431 stbma(d_space, t3, pcd, pmc_store_exc);
432 len--;
433 t2--;
434 }
435 dst = (unsigned long)pcd;
436 src = (unsigned long)pcs;
437 }
438
5b879d78 439 ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
1da177e4
LT
440 if (ret)
441 return ret;
442
443 pcs += (len & -sizeof(unsigned int));
444 pcd += (len & -sizeof(unsigned int));
445 len %= sizeof(unsigned int);
446
447 preserve_branch(handle_load_error);
448 preserve_branch(handle_store_error);
449
450 goto byte_copy;
451
452handle_load_error:
453 __asm__ __volatile__ ("pmc_load_exc:\n");
5b879d78 454 return PA_MEMCPY_LOAD_ERROR;
1da177e4
LT
455
456handle_store_error:
457 __asm__ __volatile__ ("pmc_store_exc:\n");
5b879d78
HD
458 return PA_MEMCPY_STORE_ERROR;
459}
460
461
462/* Returns 0 for success, otherwise, returns number of bytes not transferred. */
463static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
464{
465 unsigned long ret, fault_addr, reference;
466 struct exception_data *d;
467
468 ret = pa_memcpy_internal(dstp, srcp, len);
469 if (likely(ret == PA_MEMCPY_OK))
470 return 0;
471
472 /* if a load or store fault occured we can get the faulty addr */
496252f7 473 d = this_cpu_ptr(&exception_data);
5b879d78
HD
474 fault_addr = d->fault_addr;
475
476 /* error in load or store? */
477 if (ret == PA_MEMCPY_LOAD_ERROR)
478 reference = (unsigned long) srcp;
479 else
480 reference = (unsigned long) dstp;
481
482 DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n",
483 ret, len, fault_addr, reference);
484
485 if (fault_addr >= reference)
486 return len - (fault_addr - reference);
487 else
488 return len;
1da177e4
LT
489}
490
491#ifdef __KERNEL__
492unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
493{
494 mtsp(get_kernel_space(), 1);
495 mtsp(get_user_space(), 2);
496 return pa_memcpy((void __force *)dst, src, len);
497}
498
888c31fc
HD
499EXPORT_SYMBOL(__copy_from_user);
500unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
1da177e4
LT
501{
502 mtsp(get_user_space(), 1);
503 mtsp(get_kernel_space(), 2);
504 return pa_memcpy(dst, (void __force *)src, len);
505}
506
507unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
508{
509 mtsp(get_user_space(), 1);
510 mtsp(get_user_space(), 2);
511 return pa_memcpy((void __force *)dst, (void __force *)src, len);
512}
513
514
515void * memcpy(void * dst,const void *src, size_t count)
516{
517 mtsp(get_kernel_space(), 1);
518 mtsp(get_kernel_space(), 2);
519 pa_memcpy(dst, src, count);
520 return dst;
521}
522
523EXPORT_SYMBOL(copy_to_user);
524EXPORT_SYMBOL(copy_from_user);
525EXPORT_SYMBOL(copy_in_user);
526EXPORT_SYMBOL(memcpy);
db080f9c
HD
527
528long probe_kernel_read(void *dst, const void *src, size_t size)
529{
530 unsigned long addr = (unsigned long)src;
531
964f4133 532 if (addr < PAGE_SIZE)
db080f9c
HD
533 return -EFAULT;
534
535 /* check for I/O space F_EXTEND(0xfff00000) access as well? */
536
537 return __probe_kernel_read(dst, src, size);
538}
539
1da177e4 540#endif
This page took 1.222448 seconds and 5 git commands to generate.