/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
14cf11af PM |
20 | /* |
21 | * Computes the checksum of a memory block at buff, length len, | |
22 | * and adds in "sum" (32-bit). | |
23 | * | |
7e393220 | 24 | * __csum_partial(r3=buff, r4=len, r5=sum) |
14cf11af | 25 | */ |
7e393220 | 26 | _GLOBAL(__csum_partial) |
9b83ecb0 AB |
27 | addic r0,r5,0 /* clear carry */ |
28 | ||
29 | srdi. r6,r4,3 /* less than 8 bytes? */ | |
30 | beq .Lcsum_tail_word | |
31 | ||
32 | /* | |
33 | * If only halfword aligned, align to a double word. Since odd | |
34 | * aligned addresses should be rare and they would require more | |
35 | * work to calculate the correct checksum, we ignore that case | |
36 | * and take the potential slowdown of unaligned loads. | |
37 | */ | |
38 | rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ | |
39 | beq .Lcsum_aligned | |
40 | ||
41 | li r7,4 | |
42 | sub r6,r7,r6 | |
43 | mtctr r6 | |
44 | ||
45 | 1: | |
46 | lhz r6,0(r3) /* align to doubleword */ | |
47 | subi r4,r4,2 | |
48 | addi r3,r3,2 | |
49 | adde r0,r0,r6 | |
50 | bdnz 1b | |
51 | ||
52 | .Lcsum_aligned: | |
53 | /* | |
54 | * We unroll the loop such that each iteration is 64 bytes with an | |
55 | * entry and exit limb of 64 bytes, meaning a minimum size of | |
56 | * 128 bytes. | |
57 | */ | |
58 | srdi. r6,r4,7 | |
59 | beq .Lcsum_tail_doublewords /* len < 128 */ | |
60 | ||
61 | srdi r6,r4,6 | |
62 | subi r6,r6,1 | |
63 | mtctr r6 | |
64 | ||
65 | stdu r1,-STACKFRAMESIZE(r1) | |
c75df6f9 MN |
66 | std r14,STK_REG(R14)(r1) |
67 | std r15,STK_REG(R15)(r1) | |
68 | std r16,STK_REG(R16)(r1) | |
9b83ecb0 AB |
69 | |
70 | ld r6,0(r3) | |
71 | ld r9,8(r3) | |
72 | ||
73 | ld r10,16(r3) | |
74 | ld r11,24(r3) | |
75 | ||
76 | /* | |
ec5619fd SS |
77 | * On POWER6 and POWER7 back to back adde instructions take 2 cycles |
78 | * because of the XER dependency. This means the fastest this loop can | |
79 | * go is 16 cycles per iteration. The scheduling of the loop below has | |
9b83ecb0 AB |
80 | * been shown to hit this on both POWER6 and POWER7. |
81 | */ | |
82 | .align 5 | |
83 | 2: | |
84 | adde r0,r0,r6 | |
85 | ld r12,32(r3) | |
86 | ld r14,40(r3) | |
87 | ||
88 | adde r0,r0,r9 | |
89 | ld r15,48(r3) | |
90 | ld r16,56(r3) | |
91 | addi r3,r3,64 | |
92 | ||
93 | adde r0,r0,r10 | |
94 | ||
95 | adde r0,r0,r11 | |
96 | ||
97 | adde r0,r0,r12 | |
98 | ||
99 | adde r0,r0,r14 | |
100 | ||
101 | adde r0,r0,r15 | |
102 | ld r6,0(r3) | |
103 | ld r9,8(r3) | |
104 | ||
105 | adde r0,r0,r16 | |
106 | ld r10,16(r3) | |
107 | ld r11,24(r3) | |
108 | bdnz 2b | |
109 | ||
110 | ||
111 | adde r0,r0,r6 | |
112 | ld r12,32(r3) | |
113 | ld r14,40(r3) | |
114 | ||
115 | adde r0,r0,r9 | |
116 | ld r15,48(r3) | |
117 | ld r16,56(r3) | |
118 | addi r3,r3,64 | |
119 | ||
120 | adde r0,r0,r10 | |
121 | adde r0,r0,r11 | |
122 | adde r0,r0,r12 | |
123 | adde r0,r0,r14 | |
124 | adde r0,r0,r15 | |
125 | adde r0,r0,r16 | |
126 | ||
c75df6f9 MN |
127 | ld r14,STK_REG(R14)(r1) |
128 | ld r15,STK_REG(R15)(r1) | |
129 | ld r16,STK_REG(R16)(r1) | |
9b83ecb0 AB |
130 | addi r1,r1,STACKFRAMESIZE |
131 | ||
132 | andi. r4,r4,63 | |
133 | ||
134 | .Lcsum_tail_doublewords: /* Up to 127 bytes to go */ | |
135 | srdi. r6,r4,3 | |
136 | beq .Lcsum_tail_word | |
137 | ||
138 | mtctr r6 | |
139 | 3: | |
140 | ld r6,0(r3) | |
141 | addi r3,r3,8 | |
142 | adde r0,r0,r6 | |
143 | bdnz 3b | |
144 | ||
145 | andi. r4,r4,7 | |
146 | ||
147 | .Lcsum_tail_word: /* Up to 7 bytes to go */ | |
148 | srdi. r6,r4,2 | |
149 | beq .Lcsum_tail_halfword | |
150 | ||
151 | lwz r6,0(r3) | |
14cf11af | 152 | addi r3,r3,4 |
9b83ecb0 | 153 | adde r0,r0,r6 |
14cf11af | 154 | subi r4,r4,4 |
9b83ecb0 AB |
155 | |
156 | .Lcsum_tail_halfword: /* Up to 3 bytes to go */ | |
157 | srdi. r6,r4,1 | |
158 | beq .Lcsum_tail_byte | |
159 | ||
160 | lhz r6,0(r3) | |
161 | addi r3,r3,2 | |
162 | adde r0,r0,r6 | |
163 | subi r4,r4,2 | |
164 | ||
165 | .Lcsum_tail_byte: /* Up to 1 byte to go */ | |
166 | andi. r6,r4,1 | |
167 | beq .Lcsum_finish | |
168 | ||
169 | lbz r6,0(r3) | |
170 | sldi r9,r6,8 /* Pad the byte out to 16 bits */ | |
171 | adde r0,r0,r9 | |
172 | ||
173 | .Lcsum_finish: | |
174 | addze r0,r0 /* add in final carry */ | |
175 | rldicl r4,r0,32,0 /* fold two 32 bit halves together */ | |
176 | add r3,r4,r0 | |
177 | srdi r3,r3,32 | |
178 | blr | |
14cf11af | 179 | |
fdd374b6 | 180 | |
8f21bd00 | 181 | .macro srcnr |
fdd374b6 AB |
182 | 100: |
183 | .section __ex_table,"a" | |
184 | .align 3 | |
8f21bd00 | 185 | .llong 100b,.Lsrc_error_nr |
fdd374b6 AB |
186 | .previous |
187 | .endm | |
188 | ||
8f21bd00 PM |
189 | .macro source |
190 | 150: | |
191 | .section __ex_table,"a" | |
192 | .align 3 | |
193 | .llong 150b,.Lsrc_error | |
194 | .previous | |
195 | .endm | |
196 | ||
197 | .macro dstnr | |
fdd374b6 AB |
198 | 200: |
199 | .section __ex_table,"a" | |
200 | .align 3 | |
8f21bd00 PM |
201 | .llong 200b,.Ldest_error_nr |
202 | .previous | |
203 | .endm | |
204 | ||
205 | .macro dest | |
206 | 250: | |
207 | .section __ex_table,"a" | |
208 | .align 3 | |
209 | .llong 250b,.Ldest_error | |
fdd374b6 AB |
210 | .previous |
211 | .endm | |
212 | ||
14cf11af PM |
213 | /* |
214 | * Computes the checksum of a memory block at src, length len, | |
215 | * and adds in "sum" (32-bit), while copying the block to dst. | |
216 | * If an access exception occurs on src or dst, it stores -EFAULT | |
fdd374b6 AB |
217 | * to *src_err or *dst_err respectively. The caller must take any action |
218 | * required in this case (zeroing memory, recalculating partial checksum etc). | |
14cf11af PM |
219 | * |
220 | * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) | |
221 | */ | |
222 | _GLOBAL(csum_partial_copy_generic) | |
fdd374b6 AB |
223 | addic r0,r6,0 /* clear carry */ |
224 | ||
225 | srdi. r6,r5,3 /* less than 8 bytes? */ | |
226 | beq .Lcopy_tail_word | |
227 | ||
228 | /* | |
229 | * If only halfword aligned, align to a double word. Since odd | |
230 | * aligned addresses should be rare and they would require more | |
231 | * work to calculate the correct checksum, we ignore that case | |
232 | * and take the potential slowdown of unaligned loads. | |
233 | * | |
234 | * If the source and destination are relatively unaligned we only | |
235 | * align the source. This keeps things simple. | |
236 | */ | |
237 | rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ | |
238 | beq .Lcopy_aligned | |
239 | ||
d9813c36 PM |
240 | li r9,4 |
241 | sub r6,r9,r6 | |
fdd374b6 AB |
242 | mtctr r6 |
243 | ||
244 | 1: | |
8f21bd00 | 245 | srcnr; lhz r6,0(r3) /* align to doubleword */ |
14cf11af | 246 | subi r5,r5,2 |
14cf11af | 247 | addi r3,r3,2 |
fdd374b6 | 248 | adde r0,r0,r6 |
8f21bd00 | 249 | dstnr; sth r6,0(r4) |
14cf11af | 250 | addi r4,r4,2 |
fdd374b6 AB |
251 | bdnz 1b |
252 | ||
253 | .Lcopy_aligned: | |
254 | /* | |
255 | * We unroll the loop such that each iteration is 64 bytes with an | |
256 | * entry and exit limb of 64 bytes, meaning a minimum size of | |
257 | * 128 bytes. | |
258 | */ | |
259 | srdi. r6,r5,7 | |
260 | beq .Lcopy_tail_doublewords /* len < 128 */ | |
261 | ||
262 | srdi r6,r5,6 | |
263 | subi r6,r6,1 | |
264 | mtctr r6 | |
265 | ||
266 | stdu r1,-STACKFRAMESIZE(r1) | |
c75df6f9 MN |
267 | std r14,STK_REG(R14)(r1) |
268 | std r15,STK_REG(R15)(r1) | |
269 | std r16,STK_REG(R16)(r1) | |
fdd374b6 AB |
270 | |
271 | source; ld r6,0(r3) | |
272 | source; ld r9,8(r3) | |
273 | ||
274 | source; ld r10,16(r3) | |
275 | source; ld r11,24(r3) | |
276 | ||
277 | /* | |
ec5619fd SS |
278 | * On POWER6 and POWER7 back to back adde instructions take 2 cycles |
279 | * because of the XER dependency. This means the fastest this loop can | |
280 | * go is 16 cycles per iteration. The scheduling of the loop below has | |
fdd374b6 AB |
281 | * been shown to hit this on both POWER6 and POWER7. |
282 | */ | |
283 | .align 5 | |
284 | 2: | |
285 | adde r0,r0,r6 | |
286 | source; ld r12,32(r3) | |
287 | source; ld r14,40(r3) | |
288 | ||
289 | adde r0,r0,r9 | |
290 | source; ld r15,48(r3) | |
291 | source; ld r16,56(r3) | |
292 | addi r3,r3,64 | |
293 | ||
294 | adde r0,r0,r10 | |
295 | dest; std r6,0(r4) | |
296 | dest; std r9,8(r4) | |
297 | ||
298 | adde r0,r0,r11 | |
299 | dest; std r10,16(r4) | |
300 | dest; std r11,24(r4) | |
301 | ||
302 | adde r0,r0,r12 | |
303 | dest; std r12,32(r4) | |
304 | dest; std r14,40(r4) | |
305 | ||
306 | adde r0,r0,r14 | |
307 | dest; std r15,48(r4) | |
308 | dest; std r16,56(r4) | |
309 | addi r4,r4,64 | |
310 | ||
311 | adde r0,r0,r15 | |
312 | source; ld r6,0(r3) | |
313 | source; ld r9,8(r3) | |
314 | ||
315 | adde r0,r0,r16 | |
316 | source; ld r10,16(r3) | |
317 | source; ld r11,24(r3) | |
318 | bdnz 2b | |
319 | ||
320 | ||
14cf11af | 321 | adde r0,r0,r6 |
fdd374b6 AB |
322 | source; ld r12,32(r3) |
323 | source; ld r14,40(r3) | |
324 | ||
325 | adde r0,r0,r9 | |
326 | source; ld r15,48(r3) | |
327 | source; ld r16,56(r3) | |
328 | addi r3,r3,64 | |
329 | ||
330 | adde r0,r0,r10 | |
331 | dest; std r6,0(r4) | |
332 | dest; std r9,8(r4) | |
333 | ||
334 | adde r0,r0,r11 | |
335 | dest; std r10,16(r4) | |
336 | dest; std r11,24(r4) | |
337 | ||
338 | adde r0,r0,r12 | |
339 | dest; std r12,32(r4) | |
340 | dest; std r14,40(r4) | |
341 | ||
342 | adde r0,r0,r14 | |
343 | dest; std r15,48(r4) | |
344 | dest; std r16,56(r4) | |
345 | addi r4,r4,64 | |
346 | ||
347 | adde r0,r0,r15 | |
348 | adde r0,r0,r16 | |
349 | ||
c75df6f9 MN |
350 | ld r14,STK_REG(R14)(r1) |
351 | ld r15,STK_REG(R15)(r1) | |
352 | ld r16,STK_REG(R16)(r1) | |
fdd374b6 AB |
353 | addi r1,r1,STACKFRAMESIZE |
354 | ||
355 | andi. r5,r5,63 | |
356 | ||
357 | .Lcopy_tail_doublewords: /* Up to 127 bytes to go */ | |
358 | srdi. r6,r5,3 | |
359 | beq .Lcopy_tail_word | |
360 | ||
361 | mtctr r6 | |
362 | 3: | |
8f21bd00 | 363 | srcnr; ld r6,0(r3) |
fdd374b6 | 364 | addi r3,r3,8 |
14cf11af | 365 | adde r0,r0,r6 |
8f21bd00 | 366 | dstnr; std r6,0(r4) |
fdd374b6 AB |
367 | addi r4,r4,8 |
368 | bdnz 3b | |
14cf11af | 369 | |
fdd374b6 | 370 | andi. r5,r5,7 |
14cf11af | 371 | |
fdd374b6 AB |
372 | .Lcopy_tail_word: /* Up to 7 bytes to go */ |
373 | srdi. r6,r5,2 | |
374 | beq .Lcopy_tail_halfword | |
375 | ||
8f21bd00 | 376 | srcnr; lwz r6,0(r3) |
fdd374b6 AB |
377 | addi r3,r3,4 |
378 | adde r0,r0,r6 | |
8f21bd00 | 379 | dstnr; stw r6,0(r4) |
fdd374b6 AB |
380 | addi r4,r4,4 |
381 | subi r5,r5,4 | |
382 | ||
383 | .Lcopy_tail_halfword: /* Up to 3 bytes to go */ | |
384 | srdi. r6,r5,1 | |
385 | beq .Lcopy_tail_byte | |
386 | ||
8f21bd00 | 387 | srcnr; lhz r6,0(r3) |
fdd374b6 AB |
388 | addi r3,r3,2 |
389 | adde r0,r0,r6 | |
8f21bd00 | 390 | dstnr; sth r6,0(r4) |
14cf11af | 391 | addi r4,r4,2 |
fdd374b6 AB |
392 | subi r5,r5,2 |
393 | ||
394 | .Lcopy_tail_byte: /* Up to 1 byte to go */ | |
395 | andi. r6,r5,1 | |
396 | beq .Lcopy_finish | |
397 | ||
8f21bd00 | 398 | srcnr; lbz r6,0(r3) |
fdd374b6 AB |
399 | sldi r9,r6,8 /* Pad the byte out to 16 bits */ |
400 | adde r0,r0,r9 | |
8f21bd00 | 401 | dstnr; stb r6,0(r4) |
fdd374b6 AB |
402 | |
403 | .Lcopy_finish: | |
404 | addze r0,r0 /* add in final carry */ | |
405 | rldicl r4,r0,32,0 /* fold two 32 bit halves together */ | |
406 | add r3,r4,r0 | |
407 | srdi r3,r3,32 | |
408 | blr | |
409 | ||
410 | .Lsrc_error: | |
8f21bd00 PM |
411 | ld r14,STK_REG(R14)(r1) |
412 | ld r15,STK_REG(R15)(r1) | |
413 | ld r16,STK_REG(R16)(r1) | |
414 | addi r1,r1,STACKFRAMESIZE | |
415 | .Lsrc_error_nr: | |
14cf11af | 416 | cmpdi 0,r7,0 |
fdd374b6 | 417 | beqlr |
14cf11af PM |
418 | li r6,-EFAULT |
419 | stw r6,0(r7) | |
14cf11af PM |
420 | blr |
421 | ||
fdd374b6 | 422 | .Ldest_error: |
8f21bd00 PM |
423 | ld r14,STK_REG(R14)(r1) |
424 | ld r15,STK_REG(R15)(r1) | |
425 | ld r16,STK_REG(R16)(r1) | |
426 | addi r1,r1,STACKFRAMESIZE | |
427 | .Ldest_error_nr: | |
14cf11af | 428 | cmpdi 0,r8,0 |
fdd374b6 | 429 | beqlr |
14cf11af PM |
430 | li r6,-EFAULT |
431 | stw r6,0(r8) | |
14cf11af | 432 | blr |