/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
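/*
 * For reference, a rough C equivalent of the routine below (an
 * illustrative sketch only, not part of this file; "csum32" is a
 * hypothetical name).  It mirrors the big-endian layout the assembly
 * assumes: a trailing byte counts as the high byte of a halfword, and
 * carries are folded back into the sum much as the adde/addze chain
 * below does in hardware:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t csum32(const uint8_t *buff, size_t len, uint32_t sum)
 *	{
 *		uint64_t acc = sum;
 *
 *		while (len >= 4) {		// whole 32-bit words
 *			uint32_t w;
 *			memcpy(&w, buff, 4);
 *			acc += w;
 *			buff += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {			// trailing halfword
 *			uint16_t h;
 *			memcpy(&h, buff, 2);
 *			acc += h;
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)			// trailing byte: high byte
 *			acc += (uint32_t)*buff << 8;
 *		while (acc >> 32)		// fold carries back in
 *			acc = (acc & 0xffffffffu) + (acc >> 32);
 *		return (uint32_t)acc;
 *	}
 */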
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
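/*
 * A minimal C sketch of the contract above (hypothetical names, no
 * fault handling; the real routine recovers from access faults via
 * the __ex_table entries generated below and reports them through
 * *src_err / *dst_err).  It reuses the csum32 sketch given earlier:
 *
 *	static uint32_t csum_copy32(const uint8_t *src, uint8_t *dst,
 *				    size_t len, uint32_t sum)
 *	{
 *		memcpy(dst, src, len);		// copy the block ...
 *		return csum32(dst, len, sum);	// ... and sum the same bytes
 *	}
 */
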
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

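/*
 * Each label 8<n>0 through 8<n>7 in CSUM_COPY_16_BYTES_WITHEX marks a
 * load or store that may fault.  CSUM_COPY_16_BYTES_EXCODE emits the
 * matching __ex_table entries, so a fault at one of those
 * instructions is redirected to src_error (loads) or dst_error
 * (stores).
 */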
#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

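	/*
	 * The 1's complement sum is independent of byte order (an
	 * end-around-carry sum commutes with byte rotation), so an odd
	 * destination address is handled by rotating the incoming sum
	 * left one byte here and rotating the result back by one byte
	 * just before the final blr.
	 */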
	rlwinm	r0,r4,3,0x8
	rlwnm	r6,r6,r0,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r0,0	/* is destination address even? */
	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

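	/*
	 * For each cacheline: dcbt prefetches a source line ahead of
	 * the copy, and dcbz zero-allocates the destination line so it
	 * is established in the cache without first being read from
	 * memory.  dcbz can fault on the store side, hence the
	 * "54b,dst_error" entry in __ex_table below.
	 */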
53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * This code handles faults taken inside the cacheline loop, branching
 * to src_error (fault in the read part) or dst_error (fault in the
 * write part).
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error
298 | .long 51b,dst_error |