Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* U3memcpy.S: UltraSparc-III optimized memcpy. |
2 | * | |
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | |
4 | */ | |
5 | ||
6 | #ifdef __KERNEL__ | |
7 | #include <asm/visasm.h> | |
8 | #include <asm/asi.h> | |
9 | #define GLOBAL_SPARE %g7 | |
10 | #else | |
11 | #define ASI_BLK_P 0xf0 | |
12 | #define FPRS_FEF 0x04 | |
13 | #ifdef MEMCPY_DEBUG | |
14 | #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ | |
15 | clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; | |
16 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
17 | #else | |
18 | #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs | |
19 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
20 | #endif | |
21 | #define GLOBAL_SPARE %g5 | |
22 | #endif | |
23 | ||
24 | #ifndef EX_LD | |
25 | #define EX_LD(x) x | |
26 | #endif | |
27 | ||
28 | #ifndef EX_ST | |
29 | #define EX_ST(x) x | |
30 | #endif | |
31 | ||
32 | #ifndef EX_RETVAL | |
33 | #define EX_RETVAL(x) x | |
34 | #endif | |
35 | ||
36 | #ifndef LOAD | |
37 | #define LOAD(type,addr,dest) type [addr], dest | |
38 | #endif | |
39 | ||
40 | #ifndef STORE | |
41 | #define STORE(type,src,addr) type src, [addr] | |
42 | #endif | |
43 | ||
44 | #ifndef STORE_BLK | |
45 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P | |
46 | #endif | |
47 | ||
48 | #ifndef FUNC_NAME | |
49 | #define FUNC_NAME U3memcpy | |
50 | #endif | |
51 | ||
52 | #ifndef PREAMBLE | |
53 | #define PREAMBLE | |
54 | #endif | |
55 | ||
56 | #ifndef XCC | |
57 | #define XCC xcc | |
58 | #endif | |
59 | ||
60 | .register %g2,#scratch | |
61 | .register %g3,#scratch | |
62 | ||
63 | /* Special/non-trivial issues of this code: | |
64 | * | |
65 | * 1) %o5 is preserved from VISEntryHalf to VISExitHalf | |
66 | * 2) Only low 32 FPU registers are used so that only the | |
67 | * lower half of the FPU register set is dirtied by this | |
68 | * code. This is especially important in the kernel. | |
69 | * 3) This code never prefetches cachelines past the end | |
70 | * of the source buffer. | |
71 | */ | |
72 | ||
73 | .text | |
74 | .align 64 | |
75 | ||
76 | /* The cheetah's flexible spine, oversized liver, enlarged heart, | |
77 | * slender muscular body, and claws make it the swiftest hunter | |
78 | * in Africa and the fastest animal on land. Can reach speeds | |
79 | * of up to 2.4GB per second. | |
80 | */ | |
81 | ||
82 | .globl FUNC_NAME | |
83 | .type FUNC_NAME,#function | |
84 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ | |
85 | srlx %o2, 31, %g2 | |
86 | cmp %g2, 0 | |
87 | tne %xcc, 5 | |
88 | PREAMBLE | |
89 | mov %o0, %o4 | |
90 | cmp %o2, 0 | |
91 | be,pn %XCC, 85f | |
92 | or %o0, %o1, %o3 | |
93 | cmp %o2, 16 | |
94 | blu,a,pn %XCC, 80f | |
95 | or %o3, %o2, %o3 | |
96 | ||
97 | cmp %o2, (3 * 64) | |
98 | blu,pt %XCC, 70f | |
99 | andcc %o3, 0x7, %g0 | |
100 | ||
101 | /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve | |
102 | * o5 from here until we hit VISExitHalf. | |
103 | */ | |
104 | VISEntryHalf | |
105 | ||
106 | /* Is 'dst' already aligned on an 64-byte boundary? */ | |
107 | andcc %o0, 0x3f, %g2 | |
108 | be,pt %XCC, 2f | |
109 | ||
110 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number | |
111 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- | |
112 | * subtract this from 'len'. | |
113 | */ | |
114 | sub %o0, %o1, GLOBAL_SPARE | |
115 | sub %g2, 0x40, %g2 | |
116 | sub %g0, %g2, %g2 | |
117 | sub %o2, %g2, %o2 | |
118 | andcc %g2, 0x7, %g1 | |
119 | be,pt %icc, 2f | |
120 | and %g2, 0x38, %g2 | |
121 | ||
122 | 1: subcc %g1, 0x1, %g1 | |
123 | EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) | |
124 | EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE)) | |
125 | bgu,pt %XCC, 1b | |
126 | add %o1, 0x1, %o1 | |
127 | ||
128 | add %o1, GLOBAL_SPARE, %o0 | |
129 | ||
130 | 2: cmp %g2, 0x0 | |
131 | and %o1, 0x7, %g1 | |
132 | be,pt %icc, 3f | |
133 | alignaddr %o1, %g0, %o1 | |
134 | ||
135 | EX_LD(LOAD(ldd, %o1, %f4)) | |
136 | 1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) | |
137 | add %o1, 0x8, %o1 | |
138 | subcc %g2, 0x8, %g2 | |
139 | faligndata %f4, %f6, %f0 | |
140 | EX_ST(STORE(std, %f0, %o0)) | |
141 | be,pn %icc, 3f | |
142 | add %o0, 0x8, %o0 | |
143 | ||
144 | EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) | |
145 | add %o1, 0x8, %o1 | |
146 | subcc %g2, 0x8, %g2 | |
147 | faligndata %f6, %f4, %f2 | |
148 | EX_ST(STORE(std, %f2, %o0)) | |
149 | bne,pt %icc, 1b | |
150 | add %o0, 0x8, %o0 | |
151 | ||
152 | 3: LOAD(prefetch, %o1 + 0x000, #one_read) | |
153 | LOAD(prefetch, %o1 + 0x040, #one_read) | |
154 | andn %o2, (0x40 - 1), GLOBAL_SPARE | |
155 | LOAD(prefetch, %o1 + 0x080, #one_read) | |
156 | LOAD(prefetch, %o1 + 0x0c0, #one_read) | |
157 | LOAD(prefetch, %o1 + 0x100, #one_read) | |
158 | EX_LD(LOAD(ldd, %o1 + 0x000, %f0)) | |
159 | LOAD(prefetch, %o1 + 0x140, #one_read) | |
160 | EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) | |
161 | LOAD(prefetch, %o1 + 0x180, #one_read) | |
162 | EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) | |
163 | LOAD(prefetch, %o1 + 0x1c0, #one_read) | |
164 | faligndata %f0, %f2, %f16 | |
165 | EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) | |
166 | faligndata %f2, %f4, %f18 | |
167 | EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) | |
168 | faligndata %f4, %f6, %f20 | |
169 | EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) | |
170 | faligndata %f6, %f8, %f22 | |
171 | ||
172 | EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) | |
173 | faligndata %f8, %f10, %f24 | |
174 | EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) | |
175 | faligndata %f10, %f12, %f26 | |
176 | EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) | |
177 | ||
178 | subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE | |
179 | add %o1, 0x40, %o1 | |
180 | bgu,pt %XCC, 1f | |
181 | srl GLOBAL_SPARE, 6, %o3 | |
182 | ba,pt %xcc, 2f | |
183 | nop | |
184 | ||
185 | .align 64 | |
186 | 1: | |
187 | EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) | |
188 | faligndata %f12, %f14, %f28 | |
189 | EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) | |
190 | faligndata %f14, %f0, %f30 | |
191 | EX_ST(STORE_BLK(%f16, %o0)) | |
192 | EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) | |
193 | faligndata %f0, %f2, %f16 | |
194 | add %o0, 0x40, %o0 | |
195 | ||
196 | EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) | |
197 | faligndata %f2, %f4, %f18 | |
198 | EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) | |
199 | faligndata %f4, %f6, %f20 | |
200 | EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) | |
201 | subcc %o3, 0x01, %o3 | |
202 | faligndata %f6, %f8, %f22 | |
203 | EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) | |
204 | ||
205 | faligndata %f8, %f10, %f24 | |
206 | EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) | |
207 | LOAD(prefetch, %o1 + 0x1c0, #one_read) | |
208 | faligndata %f10, %f12, %f26 | |
209 | bg,pt %XCC, 1b | |
210 | add %o1, 0x40, %o1 | |
211 | ||
212 | /* Finally we copy the last full 64-byte block. */ | |
213 | 2: | |
214 | EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) | |
215 | faligndata %f12, %f14, %f28 | |
216 | EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) | |
217 | faligndata %f14, %f0, %f30 | |
218 | EX_ST(STORE_BLK(%f16, %o0)) | |
219 | EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) | |
220 | faligndata %f0, %f2, %f16 | |
221 | EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) | |
222 | faligndata %f2, %f4, %f18 | |
223 | EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) | |
224 | faligndata %f4, %f6, %f20 | |
225 | EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) | |
226 | faligndata %f6, %f8, %f22 | |
227 | EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) | |
228 | faligndata %f8, %f10, %f24 | |
229 | cmp %g1, 0 | |
230 | be,pt %XCC, 1f | |
231 | add %o0, 0x40, %o0 | |
232 | EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) | |
233 | 1: faligndata %f10, %f12, %f26 | |
234 | faligndata %f12, %f14, %f28 | |
235 | faligndata %f14, %f0, %f30 | |
236 | EX_ST(STORE_BLK(%f16, %o0)) | |
237 | add %o0, 0x40, %o0 | |
238 | add %o1, 0x40, %o1 | |
239 | membar #Sync | |
240 | ||
241 | /* Now we copy the (len modulo 64) bytes at the end. | |
242 | * Note how we borrow the %f0 loaded above. | |
243 | * | |
244 | * Also notice how this code is careful not to perform a | |
245 | * load past the end of the src buffer. | |
246 | */ | |
247 | and %o2, 0x3f, %o2 | |
248 | andcc %o2, 0x38, %g2 | |
249 | be,pn %XCC, 2f | |
250 | subcc %g2, 0x8, %g2 | |
251 | be,pn %XCC, 2f | |
252 | cmp %g1, 0 | |
253 | ||
254 | sub %o2, %g2, %o2 | |
255 | be,a,pt %XCC, 1f | |
256 | EX_LD(LOAD(ldd, %o1 + 0x00, %f0)) | |
257 | ||
258 | 1: EX_LD(LOAD(ldd, %o1 + 0x08, %f2)) | |
259 | add %o1, 0x8, %o1 | |
260 | subcc %g2, 0x8, %g2 | |
261 | faligndata %f0, %f2, %f8 | |
262 | EX_ST(STORE(std, %f8, %o0)) | |
263 | be,pn %XCC, 2f | |
264 | add %o0, 0x8, %o0 | |
265 | EX_LD(LOAD(ldd, %o1 + 0x08, %f0)) | |
266 | add %o1, 0x8, %o1 | |
267 | subcc %g2, 0x8, %g2 | |
268 | faligndata %f2, %f0, %f8 | |
269 | EX_ST(STORE(std, %f8, %o0)) | |
270 | bne,pn %XCC, 1b | |
271 | add %o0, 0x8, %o0 | |
272 | ||
273 | /* If anything is left, we copy it one byte at a time. | |
274 | * Note that %g1 is (src & 0x3) saved above before the | |
275 | * alignaddr was performed. | |
276 | */ | |
277 | 2: | |
278 | cmp %o2, 0 | |
279 | add %o1, %g1, %o1 | |
280 | VISExitHalf | |
281 | be,pn %XCC, 85f | |
282 | sub %o0, %o1, %o3 | |
283 | ||
284 | andcc %g1, 0x7, %g0 | |
285 | bne,pn %icc, 90f | |
286 | andcc %o2, 0x8, %g0 | |
287 | be,pt %icc, 1f | |
288 | nop | |
289 | EX_LD(LOAD(ldx, %o1, %o5)) | |
290 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | |
291 | add %o1, 0x8, %o1 | |
292 | ||
293 | 1: andcc %o2, 0x4, %g0 | |
294 | be,pt %icc, 1f | |
295 | nop | |
296 | EX_LD(LOAD(lduw, %o1, %o5)) | |
297 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | |
298 | add %o1, 0x4, %o1 | |
299 | ||
300 | 1: andcc %o2, 0x2, %g0 | |
301 | be,pt %icc, 1f | |
302 | nop | |
303 | EX_LD(LOAD(lduh, %o1, %o5)) | |
304 | EX_ST(STORE(sth, %o5, %o1 + %o3)) | |
305 | add %o1, 0x2, %o1 | |
306 | ||
307 | 1: andcc %o2, 0x1, %g0 | |
308 | be,pt %icc, 85f | |
309 | nop | |
310 | EX_LD(LOAD(ldub, %o1, %o5)) | |
311 | ba,pt %xcc, 85f | |
312 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | |
313 | ||
314 | .align 64 | |
315 | 70: /* 16 < len <= 64 */ | |
316 | bne,pn %XCC, 75f | |
317 | sub %o0, %o1, %o3 | |
318 | ||
319 | 72: | |
320 | andn %o2, 0xf, GLOBAL_SPARE | |
321 | and %o2, 0xf, %o2 | |
322 | 1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE | |
323 | EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) | |
324 | EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) | |
325 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | |
326 | add %o1, 0x8, %o1 | |
327 | EX_ST(STORE(stx, %g1, %o1 + %o3)) | |
328 | bgu,pt %XCC, 1b | |
329 | add %o1, 0x8, %o1 | |
330 | 73: andcc %o2, 0x8, %g0 | |
331 | be,pt %XCC, 1f | |
332 | nop | |
333 | sub %o2, 0x8, %o2 | |
334 | EX_LD(LOAD(ldx, %o1, %o5)) | |
335 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | |
336 | add %o1, 0x8, %o1 | |
337 | 1: andcc %o2, 0x4, %g0 | |
338 | be,pt %XCC, 1f | |
339 | nop | |
340 | sub %o2, 0x4, %o2 | |
341 | EX_LD(LOAD(lduw, %o1, %o5)) | |
342 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | |
343 | add %o1, 0x4, %o1 | |
344 | 1: cmp %o2, 0 | |
345 | be,pt %XCC, 85f | |
346 | nop | |
347 | ba,pt %xcc, 90f | |
348 | nop | |
349 | ||
350 | 75: | |
351 | andcc %o0, 0x7, %g1 | |
352 | sub %g1, 0x8, %g1 | |
353 | be,pn %icc, 2f | |
354 | sub %g0, %g1, %g1 | |
355 | sub %o2, %g1, %o2 | |
356 | ||
357 | 1: subcc %g1, 1, %g1 | |
358 | EX_LD(LOAD(ldub, %o1, %o5)) | |
359 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | |
360 | bgu,pt %icc, 1b | |
361 | add %o1, 1, %o1 | |
362 | ||
363 | 2: add %o1, %o3, %o0 | |
364 | andcc %o1, 0x7, %g1 | |
365 | bne,pt %icc, 8f | |
366 | sll %g1, 3, %g1 | |
367 | ||
368 | cmp %o2, 16 | |
369 | bgeu,pt %icc, 72b | |
370 | nop | |
371 | ba,a,pt %xcc, 73b | |
372 | ||
373 | 8: mov 64, %o3 | |
374 | andn %o1, 0x7, %o1 | |
375 | EX_LD(LOAD(ldx, %o1, %g2)) | |
376 | sub %o3, %g1, %o3 | |
377 | andn %o2, 0x7, GLOBAL_SPARE | |
378 | sllx %g2, %g1, %g2 | |
379 | 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) | |
380 | subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE | |
381 | add %o1, 0x8, %o1 | |
382 | srlx %g3, %o3, %o5 | |
383 | or %o5, %g2, %o5 | |
384 | EX_ST(STORE(stx, %o5, %o0)) | |
385 | add %o0, 0x8, %o0 | |
386 | bgu,pt %icc, 1b | |
387 | sllx %g3, %g1, %g2 | |
388 | ||
389 | srl %g1, 3, %g1 | |
390 | andcc %o2, 0x7, %o2 | |
391 | be,pn %icc, 85f | |
392 | add %o1, %g1, %o1 | |
393 | ba,pt %xcc, 90f | |
394 | sub %o0, %o1, %o3 | |
395 | ||
396 | .align 64 | |
397 | 80: /* 0 < len <= 16 */ | |
398 | andcc %o3, 0x3, %g0 | |
399 | bne,pn %XCC, 90f | |
400 | sub %o0, %o1, %o3 | |
401 | ||
402 | 1: | |
403 | subcc %o2, 4, %o2 | |
404 | EX_LD(LOAD(lduw, %o1, %g1)) | |
405 | EX_ST(STORE(stw, %g1, %o1 + %o3)) | |
406 | bgu,pt %XCC, 1b | |
407 | add %o1, 4, %o1 | |
408 | ||
409 | 85: retl | |
410 | mov EX_RETVAL(%o4), %o0 | |
411 | ||
412 | .align 32 | |
413 | 90: | |
414 | subcc %o2, 1, %o2 | |
415 | EX_LD(LOAD(ldub, %o1, %g1)) | |
416 | EX_ST(STORE(stb, %g1, %o1 + %o3)) | |
417 | bgu,pt %XCC, 90b | |
418 | add %o1, 1, %o1 | |
419 | retl | |
420 | mov EX_RETVAL(%o4), %o0 | |
421 | ||
422 | .size FUNC_NAME, .-FUNC_NAME |