/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
20 | /* | |
21 | * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header | |
22 | * len is in words and is always >= 5. | |
23 | * | |
24 | * In practice len == 5, but this is not guaranteed. So this code does not | |
25 | * attempt to use doubleword instructions. | |
26 | */ | |
27 | _GLOBAL(ip_fast_csum) | |
28 | lwz r0,0(r3) | |
29 | lwzu r5,4(r3) | |
30 | addic. r4,r4,-2 | |
31 | addc r0,r0,r5 | |
32 | mtctr r4 | |
33 | blelr- | |
34 | 1: lwzu r4,4(r3) | |
35 | adde r0,r0,r4 | |
36 | bdnz 1b | |
37 | addze r0,r0 /* add in final carry */ | |
38 | rldicl r4,r0,32,0 /* fold two 32-bit halves together */ | |
39 | add r0,r0,r4 | |
40 | srdi r0,r0,32 | |
41 | rlwinm r3,r0,16,0,31 /* fold two halves together */ | |
42 | add r3,r0,r3 | |
43 | not r3,r3 | |
44 | srwi r3,r3,16 | |
45 | blr | |
46 | ||
47 | /* | |
48 | * Compute checksum of TCP or UDP pseudo-header: | |
49 | * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) | |
50 | * No real gain trying to do this specially for 64 bit, but | |
51 | * the 32 bit addition may spill into the upper bits of | |
52 | * the doubleword so we still must fold it down from 64. | |
53 | */ | |
54 | _GLOBAL(csum_tcpudp_magic) | |
55 | rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ | |
56 | addc r0,r3,r4 /* add 4 32-bit words together */ | |
57 | adde r0,r0,r5 | |
58 | adde r0,r0,r7 | |
59 | rldicl r4,r0,32,0 /* fold 64 bit value */ | |
60 | add r0,r4,r0 | |
61 | srdi r0,r0,32 | |
62 | rlwinm r3,r0,16,0,31 /* fold two halves together */ | |
63 | add r3,r0,r3 | |
64 | not r3,r3 | |
65 | srwi r3,r3,16 | |
66 | blr | |
67 | ||
68 | /* | |
69 | * Computes the checksum of a memory block at buff, length len, | |
70 | * and adds in "sum" (32-bit). | |
71 | * | |
14cf11af PM |
72 | * csum_partial(r3=buff, r4=len, r5=sum) |
73 | */ | |
74 | _GLOBAL(csum_partial) | |
9b83ecb0 AB |
75 | addic r0,r5,0 /* clear carry */ |
76 | ||
77 | srdi. r6,r4,3 /* less than 8 bytes? */ | |
78 | beq .Lcsum_tail_word | |
79 | ||
80 | /* | |
81 | * If only halfword aligned, align to a double word. Since odd | |
82 | * aligned addresses should be rare and they would require more | |
83 | * work to calculate the correct checksum, we ignore that case | |
84 | * and take the potential slowdown of unaligned loads. | |
85 | */ | |
86 | rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ | |
87 | beq .Lcsum_aligned | |
88 | ||
89 | li r7,4 | |
90 | sub r6,r7,r6 | |
91 | mtctr r6 | |
92 | ||
93 | 1: | |
94 | lhz r6,0(r3) /* align to doubleword */ | |
95 | subi r4,r4,2 | |
96 | addi r3,r3,2 | |
97 | adde r0,r0,r6 | |
98 | bdnz 1b | |
99 | ||
100 | .Lcsum_aligned: | |
101 | /* | |
102 | * We unroll the loop such that each iteration is 64 bytes with an | |
103 | * entry and exit limb of 64 bytes, meaning a minimum size of | |
104 | * 128 bytes. | |
105 | */ | |
106 | srdi. r6,r4,7 | |
107 | beq .Lcsum_tail_doublewords /* len < 128 */ | |
108 | ||
109 | srdi r6,r4,6 | |
110 | subi r6,r6,1 | |
111 | mtctr r6 | |
112 | ||
113 | stdu r1,-STACKFRAMESIZE(r1) | |
c75df6f9 MN |
114 | std r14,STK_REG(R14)(r1) |
115 | std r15,STK_REG(R15)(r1) | |
116 | std r16,STK_REG(R16)(r1) | |
9b83ecb0 AB |
117 | |
118 | ld r6,0(r3) | |
119 | ld r9,8(r3) | |
120 | ||
121 | ld r10,16(r3) | |
122 | ld r11,24(r3) | |
123 | ||
124 | /* | |
125 | * On POWER6 and POWER7 back to back addes take 2 cycles because of | |
126 | * the XER dependency. This means the fastest this loop can go is | |
127 | * 16 cycles per iteration. The scheduling of the loop below has | |
128 | * been shown to hit this on both POWER6 and POWER7. | |
129 | */ | |
130 | .align 5 | |
131 | 2: | |
132 | adde r0,r0,r6 | |
133 | ld r12,32(r3) | |
134 | ld r14,40(r3) | |
135 | ||
136 | adde r0,r0,r9 | |
137 | ld r15,48(r3) | |
138 | ld r16,56(r3) | |
139 | addi r3,r3,64 | |
140 | ||
141 | adde r0,r0,r10 | |
142 | ||
143 | adde r0,r0,r11 | |
144 | ||
145 | adde r0,r0,r12 | |
146 | ||
147 | adde r0,r0,r14 | |
148 | ||
149 | adde r0,r0,r15 | |
150 | ld r6,0(r3) | |
151 | ld r9,8(r3) | |
152 | ||
153 | adde r0,r0,r16 | |
154 | ld r10,16(r3) | |
155 | ld r11,24(r3) | |
156 | bdnz 2b | |
157 | ||
158 | ||
159 | adde r0,r0,r6 | |
160 | ld r12,32(r3) | |
161 | ld r14,40(r3) | |
162 | ||
163 | adde r0,r0,r9 | |
164 | ld r15,48(r3) | |
165 | ld r16,56(r3) | |
166 | addi r3,r3,64 | |
167 | ||
168 | adde r0,r0,r10 | |
169 | adde r0,r0,r11 | |
170 | adde r0,r0,r12 | |
171 | adde r0,r0,r14 | |
172 | adde r0,r0,r15 | |
173 | adde r0,r0,r16 | |
174 | ||
c75df6f9 MN |
175 | ld r14,STK_REG(R14)(r1) |
176 | ld r15,STK_REG(R15)(r1) | |
177 | ld r16,STK_REG(R16)(r1) | |
9b83ecb0 AB |
178 | addi r1,r1,STACKFRAMESIZE |
179 | ||
180 | andi. r4,r4,63 | |
181 | ||
182 | .Lcsum_tail_doublewords: /* Up to 127 bytes to go */ | |
183 | srdi. r6,r4,3 | |
184 | beq .Lcsum_tail_word | |
185 | ||
186 | mtctr r6 | |
187 | 3: | |
188 | ld r6,0(r3) | |
189 | addi r3,r3,8 | |
190 | adde r0,r0,r6 | |
191 | bdnz 3b | |
192 | ||
193 | andi. r4,r4,7 | |
194 | ||
195 | .Lcsum_tail_word: /* Up to 7 bytes to go */ | |
196 | srdi. r6,r4,2 | |
197 | beq .Lcsum_tail_halfword | |
198 | ||
199 | lwz r6,0(r3) | |
14cf11af | 200 | addi r3,r3,4 |
9b83ecb0 | 201 | adde r0,r0,r6 |
14cf11af | 202 | subi r4,r4,4 |
9b83ecb0 AB |
203 | |
204 | .Lcsum_tail_halfword: /* Up to 3 bytes to go */ | |
205 | srdi. r6,r4,1 | |
206 | beq .Lcsum_tail_byte | |
207 | ||
208 | lhz r6,0(r3) | |
209 | addi r3,r3,2 | |
210 | adde r0,r0,r6 | |
211 | subi r4,r4,2 | |
212 | ||
213 | .Lcsum_tail_byte: /* Up to 1 byte to go */ | |
214 | andi. r6,r4,1 | |
215 | beq .Lcsum_finish | |
216 | ||
217 | lbz r6,0(r3) | |
218 | sldi r9,r6,8 /* Pad the byte out to 16 bits */ | |
219 | adde r0,r0,r9 | |
220 | ||
221 | .Lcsum_finish: | |
222 | addze r0,r0 /* add in final carry */ | |
223 | rldicl r4,r0,32,0 /* fold two 32 bit halves together */ | |
224 | add r3,r4,r0 | |
225 | srdi r3,r3,32 | |
226 | blr | |
14cf11af | 227 | |
fdd374b6 | 228 | |
/*
 * Exception-table helper macros.  Each macro places a numeric label on
 * the instruction written after it on the same line (e.g. "srcnr; lhz ...")
 * and records a (faulting insn, fixup handler) pair in __ex_table.
 *
 *   srcnr / dstnr - fixups jump to .Lsrc_error_nr / .Ldest_error_nr,
 *                   for faults taken while NO stack frame is active.
 *   source / dest - fixups jump to .Lsrc_error / .Ldest_error, which
 *                   first restore r14-r16 and pop the stack frame; used
 *                   only inside the unrolled 64-byte copy loop.
 */
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm
260 | ||
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * Returns the running sum folded to 32 bits in r3 (no complement, no
 * fold to 16 bits).  r7/r8 stay live throughout as the error pointers.
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* r0 = sum; adding 0 also clears XER.CA */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3, halfword offset within dword */
	beq	.Lcopy_aligned

	li	r9,4			/* scratch is r9 here: r7/r8 hold the err pointers */
	sub	r6,r9,r6		/* halfword copies needed to reach 8-byte alignment */
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1			/* last 64-byte chunk handled by the exit limb */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)	/* save callee-saved regs used by the loop */
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* entry limb: prime the first four doublewords (faults -> .Lsrc_error) */
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)		/* prefetch next iteration's data */
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: copy and fold in the final 64 bytes */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)	/* restore callee-saved regs */
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63		/* r5 = remaining bytes (< 64) */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)		/* store the raw (unshifted) byte */

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	/* src fault inside the unrolled loop: restore regs, pop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	/* src fault with no stack frame active */
	cmpdi	0,r7,0			/* src_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)		/* *src_err = -EFAULT */
	blr

.Ldest_error:
	/* dst fault inside the unrolled loop: restore regs, pop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	/* dst fault with no stack frame active */
	cmpdi	0,r8,0			/* dst_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)		/* *dst_err = -EFAULT */
	blr