Merge tag 'for-f2fs-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk...
[deliverable/linux.git] / arch / sparc / lib / NG4memcpy.S
1 /* NG4memcpy.S: Niagara-4 optimized memcpy.
2 *
3 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4 */
5
6 #ifdef __KERNEL__
7 #include <asm/visasm.h>
8 #include <asm/asi.h>
9 #define GLOBAL_SPARE %g7
10 #else
11 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
12 #define FPRS_FEF 0x04
13
14 /* On T4 it is very expensive to access ASRs like %fprs and
15 * %asi, avoiding a read or a write can save ~50 cycles.
16 */
17 #define FPU_ENTER \
18 rd %fprs, %o5; \
19 andcc %o5, FPRS_FEF, %g0; \
20 be,a,pn %icc, 999f; \
21 wr %g0, FPRS_FEF, %fprs; \
22 999:
23
24 #ifdef MEMCPY_DEBUG
25 #define VISEntryHalf FPU_ENTER; \
26 clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
27 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
28 #else
29 #define VISEntryHalf FPU_ENTER
30 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
31 #endif
32
33 #define GLOBAL_SPARE %g5
34 #endif
35
36 #ifndef STORE_ASI
37 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
38 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
39 #else
40 #define STORE_ASI 0x80 /* ASI_P */
41 #endif
42 #endif
43
44 #ifndef EX_LD
45 #define EX_LD(x) x
46 #endif
47
48 #ifndef EX_ST
49 #define EX_ST(x) x
50 #endif
51
52 #ifndef EX_RETVAL
53 #define EX_RETVAL(x) x
54 #endif
55
56 #ifndef LOAD
57 #define LOAD(type,addr,dest) type [addr], dest
58 #endif
59
60 #ifndef STORE
61 #ifndef MEMCPY_DEBUG
62 #define STORE(type,src,addr) type src, [addr]
63 #else
64 #define STORE(type,src,addr) type##a src, [addr] %asi
65 #endif
66 #endif
67
68 #ifndef STORE_INIT
69 #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
70 #endif
71
72 #ifndef FUNC_NAME
73 #define FUNC_NAME NG4memcpy
74 #endif
75 #ifndef PREAMBLE
76 #define PREAMBLE
77 #endif
78
79 #ifndef XCC
80 #define XCC xcc
81 #endif
82
83 .register %g2,#scratch
84 .register %g3,#scratch
85
86 .text
87 .align 64
88
89 .globl FUNC_NAME
90 .type FUNC_NAME,#function
91 FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
92 #ifdef MEMCPY_DEBUG
93 wr %g0, 0x80, %asi
94 #endif
95 srlx %o2, 31, %g2
96 cmp %g2, 0
97 tne %XCC, 5
98 PREAMBLE
99 mov %o0, %o3
100 brz,pn %o2, .Lexit
101 cmp %o2, 3
102 ble,pn %icc, .Ltiny
103 cmp %o2, 19
104 ble,pn %icc, .Lsmall
105 or %o0, %o1, %g2
106 cmp %o2, 128
107 bl,pn %icc, .Lmedium
108 nop
109
110 .Llarge:/* len >= 0x80 */
111 /* First get dest 8 byte aligned. */
112 sub %g0, %o0, %g1
113 and %g1, 0x7, %g1
114 brz,pt %g1, 51f
115 sub %o2, %g1, %o2
116
117 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
118 add %o1, 1, %o1
119 subcc %g1, 1, %g1
120 add %o0, 1, %o0
121 bne,pt %icc, 1b
122 EX_ST(STORE(stb, %g2, %o0 - 0x01))
123
124 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
125 LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
126 LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
127 LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
128 LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
129 LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
130 LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
131 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
132
133 /* Check if we can use the straight fully aligned
134 * loop, or we require the alignaddr/faligndata variant.
135 */
136 andcc %o1, 0x7, %o5
137 bne,pn %icc, .Llarge_src_unaligned
138 sub %g0, %o0, %g1
139
140 /* Legitimize the use of initializing stores by getting dest
141 * to be 64-byte aligned.
142 */
143 and %g1, 0x3f, %g1
144 brz,pt %g1, .Llarge_aligned
145 sub %o2, %g1, %o2
146
147 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
148 add %o1, 8, %o1
149 subcc %g1, 8, %g1
150 add %o0, 8, %o0
151 bne,pt %icc, 1b
152 EX_ST(STORE(stx, %g2, %o0 - 0x08))
153
154 .Llarge_aligned:
155 /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
156 andn %o2, 0x3f, %o4
157 sub %o2, %o4, %o2
158
159 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
160 add %o1, 0x40, %o1
161 EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
162 subcc %o4, 0x40, %o4
163 EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
164 EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
165 EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
166 EX_ST(STORE_INIT(%g1, %o0))
167 add %o0, 0x08, %o0
168 EX_ST(STORE_INIT(%g2, %o0))
169 add %o0, 0x08, %o0
170 EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
171 EX_ST(STORE_INIT(%g3, %o0))
172 add %o0, 0x08, %o0
173 EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
174 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
175 add %o0, 0x08, %o0
176 EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
177 EX_ST(STORE_INIT(%o5, %o0))
178 add %o0, 0x08, %o0
179 EX_ST(STORE_INIT(%g2, %o0))
180 add %o0, 0x08, %o0
181 EX_ST(STORE_INIT(%g3, %o0))
182 add %o0, 0x08, %o0
183 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
184 add %o0, 0x08, %o0
185 bne,pt %icc, 1b
186 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
187
188 membar #StoreLoad | #StoreStore
189
190 brz,pn %o2, .Lexit
191 cmp %o2, 19
192 ble,pn %icc, .Lsmall_unaligned
193 nop
194 ba,a,pt %icc, .Lmedium_noprefetch
195
196 .Lexit: retl
197 mov EX_RETVAL(%o3), %o0
198
199 .Llarge_src_unaligned:
200 andn %o2, 0x3f, %o4
201 sub %o2, %o4, %o2
202 VISEntryHalf
203 alignaddr %o1, %g0, %g1
204 add %o1, %o4, %o1
205 EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
206 1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
207 subcc %o4, 0x40, %o4
208 EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
209 EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
210 EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
211 EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
212 EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
213 EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
214 faligndata %f0, %f2, %f16
215 EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
216 faligndata %f2, %f4, %f18
217 add %g1, 0x40, %g1
218 faligndata %f4, %f6, %f20
219 faligndata %f6, %f8, %f22
220 faligndata %f8, %f10, %f24
221 faligndata %f10, %f12, %f26
222 faligndata %f12, %f14, %f28
223 faligndata %f14, %f0, %f30
224 EX_ST(STORE(std, %f16, %o0 + 0x00))
225 EX_ST(STORE(std, %f18, %o0 + 0x08))
226 EX_ST(STORE(std, %f20, %o0 + 0x10))
227 EX_ST(STORE(std, %f22, %o0 + 0x18))
228 EX_ST(STORE(std, %f24, %o0 + 0x20))
229 EX_ST(STORE(std, %f26, %o0 + 0x28))
230 EX_ST(STORE(std, %f28, %o0 + 0x30))
231 EX_ST(STORE(std, %f30, %o0 + 0x38))
232 add %o0, 0x40, %o0
233 bne,pt %icc, 1b
234 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
235 VISExitHalf
236
237 brz,pn %o2, .Lexit
238 cmp %o2, 19
239 ble,pn %icc, .Lsmall_unaligned
240 nop
241 ba,a,pt %icc, .Lmedium_unaligned
242
243 .Lmedium:
244 LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
245 andcc %g2, 0x7, %g0
246 bne,pn %icc, .Lmedium_unaligned
247 nop
248 .Lmedium_noprefetch:
249 andncc %o2, 0x20 - 1, %o5
250 be,pn %icc, 2f
251 sub %o2, %o5, %o2
252 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
253 EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
254 EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
255 EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
256 add %o1, 0x20, %o1
257 subcc %o5, 0x20, %o5
258 EX_ST(STORE(stx, %g1, %o0 + 0x00))
259 EX_ST(STORE(stx, %g2, %o0 + 0x08))
260 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
261 EX_ST(STORE(stx, %o4, %o0 + 0x18))
262 bne,pt %icc, 1b
263 add %o0, 0x20, %o0
264 2: andcc %o2, 0x18, %o5
265 be,pt %icc, 3f
266 sub %o2, %o5, %o2
267 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
268 add %o1, 0x08, %o1
269 add %o0, 0x08, %o0
270 subcc %o5, 0x08, %o5
271 bne,pt %icc, 1b
272 EX_ST(STORE(stx, %g1, %o0 - 0x08))
273 3: brz,pt %o2, .Lexit
274 cmp %o2, 0x04
275 bl,pn %icc, .Ltiny
276 nop
277 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
278 add %o1, 0x04, %o1
279 add %o0, 0x04, %o0
280 subcc %o2, 0x04, %o2
281 bne,pn %icc, .Ltiny
282 EX_ST(STORE(stw, %g1, %o0 - 0x04))
283 ba,a,pt %icc, .Lexit
284 .Lmedium_unaligned:
285 /* First get dest 8 byte aligned. */
286 sub %g0, %o0, %g1
287 and %g1, 0x7, %g1
288 brz,pt %g1, 2f
289 sub %o2, %g1, %o2
290
291 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
292 add %o1, 1, %o1
293 subcc %g1, 1, %g1
294 add %o0, 1, %o0
295 bne,pt %icc, 1b
296 EX_ST(STORE(stb, %g2, %o0 - 0x01))
297 2:
298 and %o1, 0x7, %g1
299 brz,pn %g1, .Lmedium_noprefetch
300 sll %g1, 3, %g1
301 mov 64, %g2
302 sub %g2, %g1, %g2
303 andn %o1, 0x7, %o1
304 EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
305 sllx %o4, %g1, %o4
306 andn %o2, 0x08 - 1, %o5
307 sub %o2, %o5, %o2
308 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
309 add %o1, 0x08, %o1
310 subcc %o5, 0x08, %o5
311 srlx %g3, %g2, GLOBAL_SPARE
312 or GLOBAL_SPARE, %o4, GLOBAL_SPARE
313 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
314 add %o0, 0x08, %o0
315 bne,pt %icc, 1b
316 sllx %g3, %g1, %o4
317 srl %g1, 3, %g1
318 add %o1, %g1, %o1
319 brz,pn %o2, .Lexit
320 nop
321 ba,pt %icc, .Lsmall_unaligned
322
323 .Ltiny:
324 EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
325 subcc %o2, 1, %o2
326 be,pn %icc, .Lexit
327 EX_ST(STORE(stb, %g1, %o0 + 0x00))
328 EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
329 subcc %o2, 1, %o2
330 be,pn %icc, .Lexit
331 EX_ST(STORE(stb, %g1, %o0 + 0x01))
332 EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
333 ba,pt %icc, .Lexit
334 EX_ST(STORE(stb, %g1, %o0 + 0x02))
335
336 .Lsmall:
337 andcc %g2, 0x3, %g0
338 bne,pn %icc, .Lsmall_unaligned
339 andn %o2, 0x4 - 1, %o5
340 sub %o2, %o5, %o2
341 1:
342 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
343 add %o1, 0x04, %o1
344 subcc %o5, 0x04, %o5
345 add %o0, 0x04, %o0
346 bne,pt %icc, 1b
347 EX_ST(STORE(stw, %g1, %o0 - 0x04))
348 brz,pt %o2, .Lexit
349 nop
350 ba,a,pt %icc, .Ltiny
351
352 .Lsmall_unaligned:
353 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
354 add %o1, 1, %o1
355 add %o0, 1, %o0
356 subcc %o2, 1, %o2
357 bne,pt %icc, 1b
358 EX_ST(STORE(stb, %g1, %o0 - 0x01))
359 ba,a,pt %icc, .Lexit
360 .size FUNC_NAME, .-FUNC_NAME
This page took 0.037032 seconds and 5 git commands to generate.