Commit | Line | Data |
---|---|---|
322ae8eb MS |
1 | /* |
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> | |
3 | * Copyright (C) 2008-2009 PetaLogix | |
4 | * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. | |
5 | * | |
6 | * This file is subject to the terms and conditions of the GNU General | |
7 | * Public License. See the file COPYING in the main directory of this | |
8 | * archive for more details. | |
9 | * | |
10 | * Written by Jim Law <jlaw@irispower.com> | |
11 | * | |
12 | * intended to replace: | |
13 | * memcpy in memcpy.c and | |
14 | * memmove in memmove.c | |
15 | * ... in arch/microblaze/lib | |
16 | * | |
17 | * | |
18 | * assly_fastcopy.S | |
19 | * | |
20 | * Attempt at quicker memcpy and memmove for MicroBlaze | |
21 | * Input : Operand1 in Reg r5 - destination address | |
22 | * Operand2 in Reg r6 - source address | |
23 | * Operand3 in Reg r7 - number of bytes to transfer | |
24 | * Output: Result in Reg r3 - starting destination address | |
25 | * | |
26 | * | |
27 | * Explanation: | |
28 | * Perform (possibly unaligned) copy of a block of memory | |
29 | * between mem locations with size of xfer spec'd in bytes | |
30 | */ | |
31 | ||
de93c3c1 MS |
32 | #ifdef __MICROBLAZEEL__ |
33 | #error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM. | |
34 | #endif | |
35 | ||
322ae8eb | 36 | #include <linux/linkage.h> |
13851966 | 37 | .text |
322ae8eb | 38 | .globl memcpy |
13851966 | 39 | .type memcpy, @function |
322ae8eb MS |
40 | .ent memcpy |
41 | ||
/*
 * void *memcpy(void *d, const void *s, size_t c)
 * In:   r5 = d (dest), r6 = s (src), r7 = c (byte count)
 * Out:  r3 = original dest pointer
 * Uses: r4 (counter n), r8 (aligned src), r9-r12 (temps), flags via cmpu
 * Big-endian only (guarded by the #error above). Ascending copy:
 * byte-copies to word-align d, then 32-byte blocks (word-shifting when
 * s is misaligned relative to d), then words, then trailing bytes.
 * memmove falls through to fast_memcpy_ascending when d <= s.
 */
42 | memcpy: |
43 | fast_memcpy_ascending: |
44 | /* move d to return register as value of function */ |
45 | addi r3, r5, 0 |
46 | ||
47 | addi r4, r0, 4 /* n = 4 */ |
48 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
49 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ |
50 | ||
51 | /* transfer first 0~3 bytes to get aligned dest address */ |
52 | andi r4, r5, 3 /* n = d & 3 */ |
53 | /* if zero, destination already aligned */ |
54 | beqi r4, a_dalign_done |
55 | /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */ |
56 | rsubi r4, r4, 4 |
57 | rsub r7, r4, r7 /* c = c - n adjust c */ |
58 | ||
59 | a_xfer_first_loop: |
60 | /* if no bytes left to transfer, transfer the bulk */ |
61 | beqi r4, a_dalign_done |
62 | lbui r11, r6, 0 /* h = *s */ |
63 | sbi r11, r5, 0 /* *d = h */ |
64 | addi r6, r6, 1 /* s++ */ |
65 | addi r5, r5, 1 /* d++ */ |
66 | brid a_xfer_first_loop /* loop */ |
67 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ |
68 | ||
69 | a_dalign_done: |
70 | addi r4, r0, 32 /* n = 32 */ |
71 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
72 | /* if n < 0, less than one block to transfer */ |
73 | blti r4, a_block_done |
74 | ||
75 | a_block_xfer: |
76 | andi r4, r7, 0xffffffe0 /* n = c & ~31 */ |
77 | rsub r7, r4, r7 /* c = c - n */ |
78 | ||
79 | andi r9, r6, 3 /* t1 = s & 3 */ |
80 | /* if temp != 0, unaligned transfers needed */ |
81 | bnei r9, a_block_unaligned |
82 | ||
83 | a_block_aligned: |
84 | lwi r9, r6, 0 /* t1 = *(s + 0) */ |
85 | lwi r10, r6, 4 /* t2 = *(s + 4) */ |
86 | lwi r11, r6, 8 /* t3 = *(s + 8) */ |
87 | lwi r12, r6, 12 /* t4 = *(s + 12) */ |
88 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
89 | swi r10, r5, 4 /* *(d + 4) = t2 */ |
90 | swi r11, r5, 8 /* *(d + 8) = t3 */ |
91 | swi r12, r5, 12 /* *(d + 12) = t4 */ |
92 | lwi r9, r6, 16 /* t1 = *(s + 16) */ |
93 | lwi r10, r6, 20 /* t2 = *(s + 20) */ |
94 | lwi r11, r6, 24 /* t3 = *(s + 24) */ |
95 | lwi r12, r6, 28 /* t4 = *(s + 28) */ |
96 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
97 | swi r10, r5, 20 /* *(d + 20) = t2 */ |
98 | swi r11, r5, 24 /* *(d + 24) = t3 */ |
99 | swi r12, r5, 28 /* *(d + 28) = t4 */ |
100 | addi r6, r6, 32 /* s = s + 32 */ |
101 | addi r4, r4, -32 /* n = n - 32 */ |
102 | bneid r4, a_block_aligned /* while (n) loop */ |
103 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
104 | bri a_block_done |
105 | ||
106 | a_block_unaligned: |
107 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
108 | add r6, r6, r4 /* s = s + n */ |
109 | lwi r11, r8, 0 /* h = *(as + 0) */ |
110 | ||
111 | addi r9, r9, -1 |
112 | beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */ |
113 | addi r9, r9, -1 |
114 | beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */ |
115 | ||
116 | a_block_u3: |
117 | bslli r11, r11, 24 /* h = h << 24 */ |
118 | a_bu3_loop: |
119 | lwi r12, r8, 4 /* v = *(as + 4) */ |
120 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
121 | or r9, r11, r9 /* t1 = h | t1 */ |
122 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
123 | bslli r11, r12, 24 /* h = v << 24 */ |
124 | lwi r12, r8, 8 /* v = *(as + 8) */ |
125 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
126 | or r9, r11, r9 /* t1 = h | t1 */ |
127 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
128 | bslli r11, r12, 24 /* h = v << 24 */ |
129 | lwi r12, r8, 12 /* v = *(as + 12) */ |
130 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
131 | or r9, r11, r9 /* t1 = h | t1 */ |
132 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
133 | bslli r11, r12, 24 /* h = v << 24 */ |
134 | lwi r12, r8, 16 /* v = *(as + 16) */ |
135 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
136 | or r9, r11, r9 /* t1 = h | t1 */ |
137 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
138 | bslli r11, r12, 24 /* h = v << 24 */ |
139 | lwi r12, r8, 20 /* v = *(as + 20) */ |
140 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
141 | or r9, r11, r9 /* t1 = h | t1 */ |
142 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
143 | bslli r11, r12, 24 /* h = v << 24 */ |
144 | lwi r12, r8, 24 /* v = *(as + 24) */ |
145 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
146 | or r9, r11, r9 /* t1 = h | t1 */ |
147 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
148 | bslli r11, r12, 24 /* h = v << 24 */ |
149 | lwi r12, r8, 28 /* v = *(as + 28) */ |
150 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
151 | or r9, r11, r9 /* t1 = h | t1 */ |
152 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
153 | bslli r11, r12, 24 /* h = v << 24 */ |
154 | lwi r12, r8, 32 /* v = *(as + 32) */ |
155 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
156 | or r9, r11, r9 /* t1 = h | t1 */ |
157 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
158 | bslli r11, r12, 24 /* h = v << 24 */ |
159 | addi r8, r8, 32 /* as = as + 32 */ |
160 | addi r4, r4, -32 /* n = n - 32 */ |
161 | bneid r4, a_bu3_loop /* while (n) loop */ |
162 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
163 | bri a_block_done |
164 | ||
165 | a_block_u1: |
166 | bslli r11, r11, 8 /* h = h << 8 */ |
167 | a_bu1_loop: |
168 | lwi r12, r8, 4 /* v = *(as + 4) */ |
169 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
170 | or r9, r11, r9 /* t1 = h | t1 */ |
171 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
172 | bslli r11, r12, 8 /* h = v << 8 */ |
173 | lwi r12, r8, 8 /* v = *(as + 8) */ |
174 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
175 | or r9, r11, r9 /* t1 = h | t1 */ |
176 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
177 | bslli r11, r12, 8 /* h = v << 8 */ |
178 | lwi r12, r8, 12 /* v = *(as + 12) */ |
179 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
180 | or r9, r11, r9 /* t1 = h | t1 */ |
181 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
182 | bslli r11, r12, 8 /* h = v << 8 */ |
183 | lwi r12, r8, 16 /* v = *(as + 16) */ |
184 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
185 | or r9, r11, r9 /* t1 = h | t1 */ |
186 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
187 | bslli r11, r12, 8 /* h = v << 8 */ |
188 | lwi r12, r8, 20 /* v = *(as + 20) */ |
189 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
190 | or r9, r11, r9 /* t1 = h | t1 */ |
191 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
192 | bslli r11, r12, 8 /* h = v << 8 */ |
193 | lwi r12, r8, 24 /* v = *(as + 24) */ |
194 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
195 | or r9, r11, r9 /* t1 = h | t1 */ |
196 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
197 | bslli r11, r12, 8 /* h = v << 8 */ |
198 | lwi r12, r8, 28 /* v = *(as + 28) */ |
199 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
200 | or r9, r11, r9 /* t1 = h | t1 */ |
201 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
202 | bslli r11, r12, 8 /* h = v << 8 */ |
203 | lwi r12, r8, 32 /* v = *(as + 32) */ |
204 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
205 | or r9, r11, r9 /* t1 = h | t1 */ |
206 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
207 | bslli r11, r12, 8 /* h = v << 8 */ |
208 | addi r8, r8, 32 /* as = as + 32 */ |
209 | addi r4, r4, -32 /* n = n - 32 */ |
210 | bneid r4, a_bu1_loop /* while (n) loop */ |
211 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
212 | bri a_block_done |
213 | ||
214 | a_block_u2: |
215 | bslli r11, r11, 16 /* h = h << 16 */ |
216 | a_bu2_loop: |
217 | lwi r12, r8, 4 /* v = *(as + 4) */ |
218 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
219 | or r9, r11, r9 /* t1 = h | t1 */ |
220 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
221 | bslli r11, r12, 16 /* h = v << 16 */ |
222 | lwi r12, r8, 8 /* v = *(as + 8) */ |
223 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
224 | or r9, r11, r9 /* t1 = h | t1 */ |
225 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
226 | bslli r11, r12, 16 /* h = v << 16 */ |
227 | lwi r12, r8, 12 /* v = *(as + 12) */ |
228 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
229 | or r9, r11, r9 /* t1 = h | t1 */ |
230 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
231 | bslli r11, r12, 16 /* h = v << 16 */ |
232 | lwi r12, r8, 16 /* v = *(as + 16) */ |
233 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
234 | or r9, r11, r9 /* t1 = h | t1 */ |
235 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
236 | bslli r11, r12, 16 /* h = v << 16 */ |
237 | lwi r12, r8, 20 /* v = *(as + 20) */ |
238 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
239 | or r9, r11, r9 /* t1 = h | t1 */ |
240 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
241 | bslli r11, r12, 16 /* h = v << 16 */ |
242 | lwi r12, r8, 24 /* v = *(as + 24) */ |
243 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
244 | or r9, r11, r9 /* t1 = h | t1 */ |
245 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
246 | bslli r11, r12, 16 /* h = v << 16 */ |
247 | lwi r12, r8, 28 /* v = *(as + 28) */ |
248 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
249 | or r9, r11, r9 /* t1 = h | t1 */ |
250 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
251 | bslli r11, r12, 16 /* h = v << 16 */ |
252 | lwi r12, r8, 32 /* v = *(as + 32) */ |
253 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
254 | or r9, r11, r9 /* t1 = h | t1 */ |
255 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
256 | bslli r11, r12, 16 /* h = v << 16 */ |
257 | addi r8, r8, 32 /* as = as + 32 */ |
258 | addi r4, r4, -32 /* n = n - 32 */ |
259 | bneid r4, a_bu2_loop /* while (n) loop */ |
260 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
261 | ||
262 | a_block_done: |
263 | addi r4, r0, 4 /* n = 4 */ |
264 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
265 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ |
266 | ||
267 | a_word_xfer: |
268 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ |
269 | addi r10, r0, 0 /* offset = 0 */ |
270 | ||
271 | andi r9, r6, 3 /* t1 = s & 3 */ |
272 | /* if temp != 0, unaligned transfers needed */ |
273 | bnei r9, a_word_unaligned |
274 | ||
275 | a_word_aligned: |
276 | lw r9, r6, r10 /* t1 = *(s+offset) */ |
277 | sw r9, r5, r10 /* *(d+offset) = t1 */ |
278 | addi r4, r4,-4 /* n -= 4 */ |
279 | bneid r4, a_word_aligned /* loop */ |
280 | addi r10, r10, 4 /* offset += 4 (IN DELAY SLOT) */ |
281 | ||
282 | bri a_word_done |
283 | ||
284 | a_word_unaligned: |
285 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
286 | lwi r11, r8, 0 /* h = *(as + 0) */ |
287 | addi r8, r8, 4 /* as = as + 4 */ |
288 | ||
289 | addi r9, r9, -1 |
290 | beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */ |
291 | addi r9, r9, -1 |
292 | beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */ |
293 | ||
294 | a_word_u3: |
295 | bslli r11, r11, 24 /* h = h << 24 */ |
296 | a_wu3_loop: |
297 | lw r12, r8, r10 /* v = *(as + offset) */ |
298 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
299 | or r9, r11, r9 /* t1 = h | t1 */ |
300 | sw r9, r5, r10 /* *(d + offset) = t1 */ |
301 | bslli r11, r12, 24 /* h = v << 24 */ |
302 | addi r4, r4,-4 /* n = n - 4 */ |
303 | bneid r4, a_wu3_loop /* while (n) loop */ |
304 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ |
305 | ||
306 | bri a_word_done |
307 | ||
308 | a_word_u1: |
309 | bslli r11, r11, 8 /* h = h << 8 */ |
310 | a_wu1_loop: |
311 | lw r12, r8, r10 /* v = *(as + offset) */ |
312 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
313 | or r9, r11, r9 /* t1 = h | t1 */ |
314 | sw r9, r5, r10 /* *(d + offset) = t1 */ |
315 | bslli r11, r12, 8 /* h = v << 8 */ |
316 | addi r4, r4,-4 /* n = n - 4 */ |
317 | bneid r4, a_wu1_loop /* while (n) loop */ |
318 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ |
319 | ||
320 | bri a_word_done |
321 | ||
322 | a_word_u2: |
323 | bslli r11, r11, 16 /* h = h << 16 */ |
324 | a_wu2_loop: |
325 | lw r12, r8, r10 /* v = *(as + offset) */ |
326 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
327 | or r9, r11, r9 /* t1 = h | t1 */ |
328 | sw r9, r5, r10 /* *(d + offset) = t1 */ |
329 | bslli r11, r12, 16 /* h = v << 16 */ |
330 | addi r4, r4,-4 /* n = n - 4 */ |
331 | bneid r4, a_wu2_loop /* while (n) loop */ |
332 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ |
333 | ||
334 | a_word_done: |
335 | add r5, r5, r10 /* d = d + offset */ |
336 | add r6, r6, r10 /* s = s + offset */ |
337 | rsub r7, r10, r7 /* c = c - offset */ |
338 | ||
339 | a_xfer_end: |
340 | a_xfer_end_loop: |
341 | beqi r7, a_done /* while (c) */ |
342 | lbui r9, r6, 0 /* t1 = *s */ |
343 | addi r6, r6, 1 /* s++ */ |
344 | sbi r9, r5, 0 /* *d = t1 */ |
345 | addi r7, r7, -1 /* c-- */ |
346 | brid a_xfer_end_loop /* loop */ |
347 | addi r5, r5, 1 /* d++ (IN DELAY SLOT) */ |
348 | ||
349 | a_done: |
350 | rtsd r15, 8 |
351 | nop |
352 | ||
13851966 | 353 | .size memcpy, . - memcpy |
322ae8eb MS |
354 | .end memcpy |
355 | /*----------------------------------------------------------------------------*/ | |
356 | .globl memmove |
13851966 | 357 | .type memmove, @function |
322ae8eb MS |
358 | .ent memmove |
359 | ||
/*
 * void *memmove(void *d, const void *s, size_t c)
 * In:   r5 = d (dest), r6 = s (src), r7 = c (byte count)
 * Out:  r3 = original dest pointer
 * When d <= s (cmpu below), branches to memcpy's fast_memcpy_ascending;
 * otherwise copies descending from the end so overlapping regions with
 * d > s are handled safely. Mirror structure of memcpy: align dest,
 * 32-byte blocks (shifted when s misaligned), words, trailing bytes.
 */
360 | memmove: |
361 | cmpu r4, r5, r6 /* n = s - d */ |
362 | bgei r4,fast_memcpy_ascending |
363 | ||
364 | fast_memcpy_descending: |
365 | /* move d to return register as value of function */ |
366 | addi r3, r5, 0 |
367 | ||
368 | add r5, r5, r7 /* d = d + c */ |
369 | add r6, r6, r7 /* s = s + c */ |
370 | ||
371 | addi r4, r0, 4 /* n = 4 */ |
372 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
373 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ |
374 | ||
375 | /* transfer first 0~3 bytes to get aligned dest address */ |
376 | andi r4, r5, 3 /* n = d & 3 */ |
377 | /* if zero, destination already aligned */ |
378 | beqi r4,d_dalign_done |
379 | rsub r7, r4, r7 /* c = c - n adjust c */ |
380 | ||
381 | d_xfer_first_loop: |
382 | /* if no bytes left to transfer, transfer the bulk */ |
383 | beqi r4,d_dalign_done |
384 | addi r6, r6, -1 /* s-- */ |
385 | addi r5, r5, -1 /* d-- */ |
386 | lbui r11, r6, 0 /* h = *s */ |
387 | sbi r11, r5, 0 /* *d = h */ |
388 | brid d_xfer_first_loop /* loop */ |
389 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ |
390 | ||
391 | d_dalign_done: |
392 | addi r4, r0, 32 /* n = 32 */ |
393 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
394 | /* if n < 0, less than one block to transfer */ |
395 | blti r4, d_block_done |
396 | ||
397 | d_block_xfer: |
398 | andi r4, r7, 0xffffffe0 /* n = c & ~31 */ |
399 | rsub r7, r4, r7 /* c = c - n */ |
400 | ||
401 | andi r9, r6, 3 /* t1 = s & 3 */ |
402 | /* if temp != 0, unaligned transfers needed */ |
403 | bnei r9, d_block_unaligned |
404 | ||
405 | d_block_aligned: |
406 | addi r6, r6, -32 /* s = s - 32 */ |
407 | addi r5, r5, -32 /* d = d - 32 */ |
408 | lwi r9, r6, 28 /* t1 = *(s + 28) */ |
409 | lwi r10, r6, 24 /* t2 = *(s + 24) */ |
410 | lwi r11, r6, 20 /* t3 = *(s + 20) */ |
411 | lwi r12, r6, 16 /* t4 = *(s + 16) */ |
412 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
413 | swi r10, r5, 24 /* *(d + 24) = t2 */ |
414 | swi r11, r5, 20 /* *(d + 20) = t3 */ |
415 | swi r12, r5, 16 /* *(d + 16) = t4 */ |
416 | lwi r9, r6, 12 /* t1 = *(s + 12) */ |
417 | lwi r10, r6, 8 /* t2 = *(s + 8) */ |
418 | lwi r11, r6, 4 /* t3 = *(s + 4) */ |
419 | lwi r12, r6, 0 /* t4 = *(s + 0) */ |
420 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
421 | swi r10, r5, 8 /* *(d + 8) = t2 */ |
422 | swi r11, r5, 4 /* *(d + 4) = t3 */ |
423 | addi r4, r4, -32 /* n = n - 32 */ |
424 | bneid r4, d_block_aligned /* while (n) loop */ |
425 | swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */ |
426 | bri d_block_done |
427 | ||
428 | d_block_unaligned: |
429 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
430 | rsub r6, r4, r6 /* s = s - n */ |
431 | lwi r11, r8, 0 /* h = *(as + 0) */ |
432 | ||
433 | addi r9, r9, -1 |
434 | beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */ |
435 | addi r9, r9, -1 |
436 | beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */ |
437 | ||
438 | d_block_u3: |
439 | bsrli r11, r11, 8 /* h = h >> 8 */ |
440 | d_bu3_loop: |
441 | addi r8, r8, -32 /* as = as - 32 */ |
442 | addi r5, r5, -32 /* d = d - 32 */ |
443 | lwi r12, r8, 28 /* v = *(as + 28) */ |
444 | bslli r9, r12, 24 /* t1 = v << 24 */ |
445 | or r9, r11, r9 /* t1 = h | t1 */ |
446 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
447 | bsrli r11, r12, 8 /* h = v >> 8 */ |
448 | lwi r12, r8, 24 /* v = *(as + 24) */ |
449 | bslli r9, r12, 24 /* t1 = v << 24 */ |
450 | or r9, r11, r9 /* t1 = h | t1 */ |
451 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
452 | bsrli r11, r12, 8 /* h = v >> 8 */ |
453 | lwi r12, r8, 20 /* v = *(as + 20) */ |
454 | bslli r9, r12, 24 /* t1 = v << 24 */ |
455 | or r9, r11, r9 /* t1 = h | t1 */ |
456 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
457 | bsrli r11, r12, 8 /* h = v >> 8 */ |
458 | lwi r12, r8, 16 /* v = *(as + 16) */ |
459 | bslli r9, r12, 24 /* t1 = v << 24 */ |
460 | or r9, r11, r9 /* t1 = h | t1 */ |
461 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
462 | bsrli r11, r12, 8 /* h = v >> 8 */ |
463 | lwi r12, r8, 12 /* v = *(as + 12) */ |
464 | bslli r9, r12, 24 /* t1 = v << 24 */ |
465 | or r9, r11, r9 /* t1 = h | t1 */ |
466 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
467 | bsrli r11, r12, 8 /* h = v >> 8 */ |
468 | lwi r12, r8, 8 /* v = *(as + 8) */ |
469 | bslli r9, r12, 24 /* t1 = v << 24 */ |
470 | or r9, r11, r9 /* t1 = h | t1 */ |
471 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
472 | bsrli r11, r12, 8 /* h = v >> 8 */ |
473 | lwi r12, r8, 4 /* v = *(as + 4) */ |
474 | bslli r9, r12, 24 /* t1 = v << 24 */ |
475 | or r9, r11, r9 /* t1 = h | t1 */ |
476 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
477 | bsrli r11, r12, 8 /* h = v >> 8 */ |
478 | lwi r12, r8, 0 /* v = *(as + 0) */ |
479 | bslli r9, r12, 24 /* t1 = v << 24 */ |
480 | or r9, r11, r9 /* t1 = h | t1 */ |
481 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
482 | addi r4, r4, -32 /* n = n - 32 */ |
483 | bneid r4, d_bu3_loop /* while (n) loop */ |
484 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ |
485 | bri d_block_done |
486 | ||
487 | d_block_u1: |
488 | bsrli r11, r11, 24 /* h = h >> 24 */ |
489 | d_bu1_loop: |
490 | addi r8, r8, -32 /* as = as - 32 */ |
491 | addi r5, r5, -32 /* d = d - 32 */ |
492 | lwi r12, r8, 28 /* v = *(as + 28) */ |
493 | bslli r9, r12, 8 /* t1 = v << 8 */ |
494 | or r9, r11, r9 /* t1 = h | t1 */ |
495 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
496 | bsrli r11, r12, 24 /* h = v >> 24 */ |
497 | lwi r12, r8, 24 /* v = *(as + 24) */ |
498 | bslli r9, r12, 8 /* t1 = v << 8 */ |
499 | or r9, r11, r9 /* t1 = h | t1 */ |
500 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
501 | bsrli r11, r12, 24 /* h = v >> 24 */ |
502 | lwi r12, r8, 20 /* v = *(as + 20) */ |
503 | bslli r9, r12, 8 /* t1 = v << 8 */ |
504 | or r9, r11, r9 /* t1 = h | t1 */ |
505 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
506 | bsrli r11, r12, 24 /* h = v >> 24 */ |
507 | lwi r12, r8, 16 /* v = *(as + 16) */ |
508 | bslli r9, r12, 8 /* t1 = v << 8 */ |
509 | or r9, r11, r9 /* t1 = h | t1 */ |
510 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
511 | bsrli r11, r12, 24 /* h = v >> 24 */ |
512 | lwi r12, r8, 12 /* v = *(as + 12) */ |
513 | bslli r9, r12, 8 /* t1 = v << 8 */ |
514 | or r9, r11, r9 /* t1 = h | t1 */ |
515 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
516 | bsrli r11, r12, 24 /* h = v >> 24 */ |
517 | lwi r12, r8, 8 /* v = *(as + 8) */ |
518 | bslli r9, r12, 8 /* t1 = v << 8 */ |
519 | or r9, r11, r9 /* t1 = h | t1 */ |
520 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
521 | bsrli r11, r12, 24 /* h = v >> 24 */ |
522 | lwi r12, r8, 4 /* v = *(as + 4) */ |
523 | bslli r9, r12, 8 /* t1 = v << 8 */ |
524 | or r9, r11, r9 /* t1 = h | t1 */ |
525 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
526 | bsrli r11, r12, 24 /* h = v >> 24 */ |
527 | lwi r12, r8, 0 /* v = *(as + 0) */ |
528 | bslli r9, r12, 8 /* t1 = v << 8 */ |
529 | or r9, r11, r9 /* t1 = h | t1 */ |
530 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
531 | addi r4, r4, -32 /* n = n - 32 */ |
532 | bneid r4, d_bu1_loop /* while (n) loop */ |
533 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ |
534 | bri d_block_done |
535 | ||
536 | d_block_u2: |
537 | bsrli r11, r11, 16 /* h = h >> 16 */ |
538 | d_bu2_loop: |
539 | addi r8, r8, -32 /* as = as - 32 */ |
540 | addi r5, r5, -32 /* d = d - 32 */ |
541 | lwi r12, r8, 28 /* v = *(as + 28) */ |
542 | bslli r9, r12, 16 /* t1 = v << 16 */ |
543 | or r9, r11, r9 /* t1 = h | t1 */ |
544 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
545 | bsrli r11, r12, 16 /* h = v >> 16 */ |
546 | lwi r12, r8, 24 /* v = *(as + 24) */ |
547 | bslli r9, r12, 16 /* t1 = v << 16 */ |
548 | or r9, r11, r9 /* t1 = h | t1 */ |
549 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
550 | bsrli r11, r12, 16 /* h = v >> 16 */ |
551 | lwi r12, r8, 20 /* v = *(as + 20) */ |
552 | bslli r9, r12, 16 /* t1 = v << 16 */ |
553 | or r9, r11, r9 /* t1 = h | t1 */ |
554 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
555 | bsrli r11, r12, 16 /* h = v >> 16 */ |
556 | lwi r12, r8, 16 /* v = *(as + 16) */ |
557 | bslli r9, r12, 16 /* t1 = v << 16 */ |
558 | or r9, r11, r9 /* t1 = h | t1 */ |
559 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
560 | bsrli r11, r12, 16 /* h = v >> 16 */ |
561 | lwi r12, r8, 12 /* v = *(as + 12) */ |
562 | bslli r9, r12, 16 /* t1 = v << 16 */ |
563 | or r9, r11, r9 /* t1 = h | t1 */ |
564 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
565 | bsrli r11, r12, 16 /* h = v >> 16 */ |
566 | lwi r12, r8, 8 /* v = *(as + 8) */ |
567 | bslli r9, r12, 16 /* t1 = v << 16 */ |
568 | or r9, r11, r9 /* t1 = h | t1 */ |
569 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
570 | bsrli r11, r12, 16 /* h = v >> 16 */ |
571 | lwi r12, r8, 4 /* v = *(as + 4) */ |
572 | bslli r9, r12, 16 /* t1 = v << 16 */ |
573 | or r9, r11, r9 /* t1 = h | t1 */ |
574 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
575 | bsrli r11, r12, 16 /* h = v >> 16 */ |
576 | lwi r12, r8, 0 /* v = *(as + 0) */ |
577 | bslli r9, r12, 16 /* t1 = v << 16 */ |
578 | or r9, r11, r9 /* t1 = h | t1 */ |
579 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
580 | addi r4, r4, -32 /* n = n - 32 */ |
581 | bneid r4, d_bu2_loop /* while (n) loop */ |
582 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ |
583 | ||
584 | d_block_done: |
585 | addi r4, r0, 4 /* n = 4 */ |
586 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
587 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ |
588 | ||
589 | d_word_xfer: |
590 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ |
591 | rsub r5, r4, r5 /* d = d - n */ |
592 | rsub r6, r4, r6 /* s = s - n */ |
593 | rsub r7, r4, r7 /* c = c - n */ |
594 | ||
595 | andi r9, r6, 3 /* t1 = s & 3 */ |
596 | /* if temp != 0, unaligned transfers needed */ |
597 | bnei r9, d_word_unaligned |
598 | ||
599 | d_word_aligned: |
600 | addi r4, r4,-4 /* n -= 4 */ |
601 | lw r9, r6, r4 /* t1 = *(s+n) */ |
602 | bneid r4, d_word_aligned /* loop */ |
603 | sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */ |
604 | ||
605 | bri d_word_done |
606 | ||
607 | d_word_unaligned: |
608 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
609 | lw r11, r8, r4 /* h = *(as + n) */ |
610 | ||
611 | addi r9, r9, -1 |
612 | beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */ |
613 | addi r9, r9, -1 |
614 | beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */ |
615 | ||
616 | d_word_u3: |
617 | bsrli r11, r11, 8 /* h = h >> 8 */ |
618 | d_wu3_loop: |
619 | addi r4, r4,-4 /* n = n - 4 */ |
620 | lw r12, r8, r4 /* v = *(as + n) */ |
621 | bslli r9, r12, 24 /* t1 = v << 24 */ |
622 | or r9, r11, r9 /* t1 = h | t1 */ |
623 | sw r9, r5, r4 /* *(d + n) = t1 */ |
624 | bneid r4, d_wu3_loop /* while (n) loop */ |
625 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ |
626 | ||
627 | bri d_word_done |
628 | ||
629 | d_word_u1: |
630 | bsrli r11, r11, 24 /* h = h >> 24 */ |
631 | d_wu1_loop: |
632 | addi r4, r4,-4 /* n = n - 4 */ |
633 | lw r12, r8, r4 /* v = *(as + n) */ |
634 | bslli r9, r12, 8 /* t1 = v << 8 */ |
635 | or r9, r11, r9 /* t1 = h | t1 */ |
636 | sw r9, r5, r4 /* *(d + n) = t1 */ |
637 | bneid r4, d_wu1_loop /* while (n) loop */ |
638 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ |
639 | ||
640 | bri d_word_done |
641 | ||
642 | d_word_u2: |
643 | bsrli r11, r11, 16 /* h = h >> 16 */ |
644 | d_wu2_loop: |
645 | addi r4, r4,-4 /* n = n - 4 */ |
646 | lw r12, r8, r4 /* v = *(as + n) */ |
647 | bslli r9, r12, 16 /* t1 = v << 16 */ |
648 | or r9, r11, r9 /* t1 = h | t1 */ |
649 | sw r9, r5, r4 /* *(d + n) = t1 */ |
650 | bneid r4, d_wu2_loop /* while (n) loop */ |
651 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ |
652 | ||
653 | d_word_done: |
654 | ||
655 | d_xfer_end: |
656 | d_xfer_end_loop: |
657 | beqi r7, a_done /* while (c); exits via memcpy's a_done epilogue (rtsd) */ |
658 | addi r6, r6, -1 /* s-- */ |
659 | lbui r9, r6, 0 /* t1 = *s */ |
660 | addi r5, r5, -1 /* d-- */ |
661 | sbi r9, r5, 0 /* *d = t1 */ |
662 | brid d_xfer_end_loop /* loop */ |
663 | addi r7, r7, -1 /* c-- (IN DELAY SLOT) */ |
664 | ||
665 | d_done: |
666 | rtsd r15, 8 |
667 | nop |
668 | ||
13851966 | 669 | .size memmove, . - memmove |
322ae8eb | 670 | .end memmove |