/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

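/*
 * Copy 16 bytes using four word loads and four word stores.  The
 * update forms (lwzu/stwu) leave r4 and r6 advanced by 16, so a loop
 * built from this macro needs no separate pointer arithmetic.
 */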
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

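/*
 * The same 16-byte block, but with every load and store carrying a
 * numeric label (8n0-8n7) so that a fault at any instruction can be
 * mapped to fixup code via the __ex_table entries generated by
 * COPY_16_BYTES_EXCODE below.
 */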
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

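/*
 * Fixup code for block n of the cacheline loop: label 9n0 catches
 * faults on the loads, 9n1 faults on the stores.  Each adjusts r5 for
 * the part of the current cacheline already copied before branching
 * to the common read (104f) or write (105f) fault handler.
 */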
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

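/*
 * memset(s, c, n): r3 = s, r4 = c, r5 = n.  The fill byte is first
 * replicated into all four bytes of r4; whole words are then stored
 * while at least four bytes remain, and a byte loop handles the tail.
 * r3 is never modified, so the return value is s, as required.
 */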
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

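/*
 * memmove(dest, src, n): r3 = dest, r4 = src, r5 = n.  If dest is
 * above src, an overlapping forward copy could overwrite source bytes
 * before reading them, so the copy is done backwards; otherwise
 * memmove falls through into memcpy.
 */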
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

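/*
 * memcpy(dest, src, n): r3 = dest, r4 = src, r5 = n.  The main loop
 * (label 1) moves eight bytes per iteration once the destination is
 * word-aligned; label 5 first copies the 1-3 leading bytes needed to
 * reach alignment, and labels 2-4 mop up the trailing word and bytes.
 */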
_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

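/*
 * backwards_memcpy(dest, src, n): same register arguments as memcpy,
 * but the copy starts at dest+n and src+n and works downwards, which
 * is what memmove needs when the regions overlap with dest above src.
 */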
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

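/*
 * __copy_tofrom_user(to, from, size): r3 = to, r4 = from, r5 = size.
 * Copies between kernel and user space with fault recovery: aligns
 * the destination to a cacheline boundary, copies whole cachelines
 * using dcbt (prefetch source) and dcbz (pre-zero destination), and
 * returns the number of bytes NOT copied (0 on success).
 */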
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

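	/*
	 * Main cacheline loop: dcbt prefetches the source line chosen
	 * by the lookahead distance in r3, and dcbz zeroes the
	 * destination line so it is allocated in the cache without
	 * being fetched from memory.  A fault on dcbz is treated as a
	 * write fault via the 105f fixup.
	 */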
53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to
 * either 104f (if in the read part) or 105f (if in the write part),
 * after updating r5.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

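/*
 * Exception table entries for the trailing word and byte loops and
 * the read-fault recovery code above: each entry pairs a faulting
 * load or store with its fixup address.
 */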
	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text