Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * Optimized version of the ip_fast_csum() function | |
3 | * Used for calculating IP header checksum | |
4 | * | |
5 | * Return: 16bit checksum, complemented | |
6 | * | |
7 | * Inputs: | |
8 | * in0: address of buffer to checksum (char *) | |
9 | * in1: length of the buffer in 32-bit words (int) | |
10 | * | |
11 | * Copyright (C) 2002 Intel Corp. | |
12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | |
13 | */ | |
14 | ||
15 | #include <asm/asmmacro.h> | |
16 | ||
17 | /* | |
18 | * Since we know that most likely this function is called with buf aligned | |
19 | * on 4-byte boundary and 20 bytes in length, we can execute rather quickly | |
20 | * versus calling generic version of do_csum, which has lots of overhead in | |
21 | * handling various alignments and sizes. However, due to lack of constraints | |
22 | * put on the function input argument, cases with alignment not on 4-byte or | |
23 | * size not equal to 20 bytes will be handled by the generic do_csum function. | |
24 | */ | |
25 | ||
/*
 * Register aliases: in0/in1 are the two incoming arguments, ret0 carries
 * the return value.
 */
#define in0	r32
#define in1	r33
#define ret0	r8

/*
 * ip_fast_csum: checksum of an IP header.
 *
 * Inputs:
 *	in0: address of the header
 *	in1: header length in 32-bit words
 *
 * Return (ret0): the complemented 16-bit checksum.  Bits 16 and up of ret0
 * are always clear, on both the fast and the generic path.
 *
 * Fast path (in1 == 5 and in0 4-byte aligned): the five words are loaded
 * through two interleaved pointers, added up in a 64-bit register and folded
 * down to 16 bits inline.  Every other case is forwarded to do_csum() with
 * the length converted to bytes.
 *
 * Temporaries: r9, r14, r15, r20-r24, p6, p7.  The generic path additionally
 * allocates r34/r35 to keep b0 and ar.pfs across the call.
 */
GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	cmp.ne	p6,p7=5,in1		// size other than 20 byte?
	and	r14=3,in0		// is it aligned on 4-byte?
	add	r15=4,in0		// second source pointer
	;;
	// p6 = wrong size OR misaligned, p7 = right size AND aligned
	cmp.ne.or.andcm p6,p7=r14,r0
	;;
	// Start the first two loads speculatively on the fast-path predicate
	// so they overlap with the branch decision.
(p7)	ld4	r20=[in0],8		// word 0, in0 -> word 2
(p7)	ld4	r21=[r15],8		// word 1, r15 -> word 3
(p6)	br.spnt	.generic
	;;
	ld4	r22=[in0],8		// word 2, in0 -> word 4
	ld4	r23=[r15],8		// word 3
	;;
	ld4	r24=[in0]		// word 4
	add	r20=r20,r21
	add	r22=r22,r23
	;;
	add	r20=r20,r22
	;;
	add	r20=r20,r24		// r20 = sum of all five words
	;;
	// Fold the sum to 16 bits: add the part above bit 15 back into the
	// low 16 bits.  Three rounds are enough for a sum of five 32-bit
	// values.
	shr.u	ret0=r20,16		// now need to add the carry
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16		// add carry again
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	mov	r9=0xffff		// mask limiting the result to 16 bits
	;;
	// ret0 = 0xffff & ~r20.  Complementing against -1 instead would
	// return a value with bits 16..63 all set.
	andcm	ret0=r9,r20
	.restore sp			// reset frame state
	br.ret.sptk.many b0
	;;

	// Generic path: any length or alignment, handled by do_csum().
.generic:
	.prologue
	.save ar.pfs, r35
	alloc	r35=ar.pfs,2,2,2,0	// 2 in, 2 local, 2 out
	.save rp, r34
	mov	r34=b0			// keep the return address across the call
	.body
	dep.z	out1=in1,2,30		// length in bytes = in1 << 2
	mov	out0=in0
	;;
	br.call.sptk.many b0=do_csum
	;;
	mov	r9=0xffff		// mask limiting the result to 16 bits
	mov	ar.pfs=r35
	mov	b0=r34
	;;
	andcm	ret0=r9,ret0		// complement, keep the low 16 bits only
	br.ret.sptk.many b0
END(ip_fast_csum)