Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * This routine clears to zero a linear memory buffer in user space. | |
3 | * | |
4 | * Inputs: | |
5 | * in0: address of buffer | |
6 | * in1: length of buffer in bytes | |
7 | * Outputs: | |
8 | * r8: number of bytes that didn't get cleared due to a fault | |
9 | * | |
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | |
11 | * Stephane Eranian <eranian@hpl.hp.com> | |
12 | */ | |
13 | ||
14 | #include <asm/asmmacro.h> | |
e007c533 | 15 | #include <asm/export.h> |
1da177e4 LT |
16 | |
17 | // | |
18 | // arguments | |
19 | // | |
20 | #define buf r32 /* in0: address of the buffer, advanced by every store */ | |
21 | #define len r33 /* in1: length in bytes, counted down as bytes are cleared */ | |
22 | ||
23 | // | |
24 | // local registers | |
25 | // | |
26 | #define cnt r16 /* number of 16-byte chunks for the core loop (len >> 4) */ | |
27 | #define buf2 r17 /* second store pointer of the core loop (buf + 8) */ | |
28 | #define saved_lc r18 /* ar.lc as found on entry, written back before returning */ | |
29 | #define saved_pfs r19 /* ar.pfs as returned by alloc */ | |
30 | #define tmp r20 /* loop count minus one, loaded into ar.lc */ | |
31 | #define len2 r21 /* bytes-left value used by .Lexit2 when p6 is set */ | |
32 | #define len3 r22 /* bytes-left value used by .Lexit2 when p7 is set */ | |
33 | ||
34 | // | |
35 | // Theory of operations: | |
36 | // - we check whether or not the buffer is small, i.e., less than 17 | |
37 | // in which case we do the byte by byte loop. | |
38 | // | |
39 | // - Otherwise we go progressively from 1 byte store to 8byte store in | |
40 | // the head part, the body is a 16byte store loop and we finish with the | |
41 | // tail for the last 15 bytes. | |
42 | // The good point about this breakdown is that the long buffer handling | |
43 | // contains only 2 branches. | |
44 | // | |
45 | // The reason for not using shifting & masking for both the head and the | |
46 | // tail is to stay semantically correct. This routine is not supposed | |
47 | // to write bytes outside of the buffer. While most of the time this would | |
48 | // be ok, we can't tolerate a mistake. A classical example is the case | |
49 | // of multithreaded code where the extra bytes touched are actually owned | |
50 | // by another thread which runs concurrently with ours. Another, less likely, | |
51 | // example is with device drivers where reading an I/O mapped location may | |
52 | // have side effects (same thing for writing). | |
53 | // | |
54 | ||
55 | // __do_clear_user(buf, len): zero len bytes starting at user-space address buf. | |
56 | // ret0 = number of bytes NOT cleared (0 on success); ar.lc is preserved. | |
57 | GLOBAL_ENTRY(__do_clear_user) | |
58 | .prologue | |
59 | .save ar.pfs, saved_pfs | |
60 | alloc saved_pfs=ar.pfs,2,0,0,0 | |
61 | cmp.eq p6,p0=r0,len // check for zero length | |
62 | .save ar.lc, saved_lc | |
63 | mov saved_lc=ar.lc // preserve ar.lc (slow) | |
64 | .body | |
65 | ;; // avoid WAW on CFM | |
66 | adds tmp=-1,len // br.cloop is repeat/until: lc = iterations - 1 | |
67 | mov ret0=len // return value is length at this point | |
68 | (p6) br.ret.spnt.many rp | |
69 | ;; | |
70 | cmp.lt p6,p0=16,len // if len > 16 then long memset | |
71 | mov ar.lc=tmp // initialize lc for small count; lc is now clobbered on the long path too | |
72 | (p6) br.cond.dptk .long_do_clear | |
73 | ;; // WAR on ar.lc | |
74 | // | |
75 | // worst case 16 iterations, avg 8 iterations | |
76 | // | |
77 | // We could have played with the predicates to use the extra | |
78 | // M slot for 2 stores/iteration but the cost of initializing | |
79 | // the various counters compared to how long the loop is supposed | |
80 | // to last on average does not make this solution viable. | |
81 | // | |
82 | 1: | |
83 | EX( .Lexit1, st1 [buf]=r0,1 ) | |
84 | adds len=-1,len // countdown length using len | |
85 | br.cloop.dptk 1b | |
86 | ;; // avoid RAW on ar.lc | |
87 | // | |
88 | // .Lexit1: comes from byte by byte loop (fault or normal completion) | |
89 | // len contains bytes left | |
90 | .Lexit1: | |
91 | mov ret0=len // faster than using ar.lc | |
92 | mov ar.lc=saved_lc | |
93 | br.ret.sptk.many rp // end of short clear_user | |
94 | ||
95 | // | |
96 | // At this point we know we have more than 16 bytes to clear | |
97 | // so we focus on alignment (no branches required) | |
98 | // | |
99 | // The use of len/len2 for countdown of the number of bytes left | |
100 | // instead of ret0 is due to the fact that the exception code | |
101 | // changes the values of r8. | |
102 | // | |
103 | .long_do_clear: | |
104 | tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) | |
105 | ;; | |
106 | EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned | |
107 | (p6) adds len=-1,len;; // sync because buf is modified | |
108 | tbit.nz p6,p0=buf,1 | |
109 | ;; | |
110 | EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned | |
111 | (p6) adds len=-2,len;; | |
112 | tbit.nz p6,p0=buf,2 | |
113 | ;; | |
114 | EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned | |
115 | (p6) adds len=-4,len;; | |
116 | tbit.nz p6,p0=buf,3 | |
117 | ;; | |
118 | EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned | |
119 | (p6) adds len=-8,len;; | |
120 | shr.u cnt=len,4 // number of 128-bit (2x64bit) words | |
121 | ;; | |
122 | cmp.eq p6,p0=r0,cnt | |
123 | adds tmp=-1,cnt | |
124 | (p6) br.cond.dpnt .dotail // we have less than 16 bytes left | |
125 | ;; | |
126 | adds buf2=8,buf // setup second base pointer | |
127 | mov ar.lc=tmp | |
128 | ;; | |
129 | ||
130 | // | |
131 | // 16bytes/iteration core loop | |
132 | // | |
133 | // The second store can never generate a fault because | |
134 | // we come into the loop only when we are 16-byte aligned. | |
135 | // This means that if we cross a page then it will always be | |
136 | // in the first store and never in the second. | |
137 | // | |
138 | // | |
139 | // We need to keep track of the remaining length. A possible (optimistic) | |
140 | // way would be to use ar.lc and derive how many bytes were left by | |
141 | // doing : left= 16*ar.lc + 16. this would avoid the addition at | |
142 | // every iteration. | |
143 | // However we need to keep the synchronization point. A template | |
144 | // M;;MB does not exist and thus we can keep the addition at no | |
145 | // extra cycle cost (use a nop slot anyway). It also simplifies the | |
146 | // (unlikely) error recovery code | |
147 | // | |
148 | ||
149 | 2: EX(.Lexit3, st8 [buf]=r0,16 ) | |
150 | ;; // needed to get len correct when error | |
151 | st8 [buf2]=r0,16 | |
152 | adds len=-16,len | |
153 | br.cloop.dptk 2b | |
154 | ;; | |
155 | // | |
156 | // tail correction based on len only | |
157 | // | |
158 | // len < 16 here, so the bytes still left before each tail store are the | |
159 | // low bits of len: len2 holds them for st8/st2 (p6), len3 for st4/st1 (p7). | |
160 | // Masking len (instead of chaining predicated subtractions) keeps the count | |
161 | // right when a store is skipped; the extra ALU ops use otherwise empty slots. | |
162 | // | |
163 | .dotail: | |
164 | mov len2=len // st8 fault: all of len is still left | |
165 | mov len3=len // keeps len3 defined for .Lexit2 | |
166 | tbit.nz p6,p7=len,3 // p7 = !p6 so .Lexit2 never sees a stale p7 | |
167 | ;; | |
168 | EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes | |
169 | and len3=7,len // st4 fault: len & 7 bytes left | |
170 | tbit.nz p7,p6=len,2 | |
171 | ;; | |
172 | EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes | |
173 | and len2=3,len // st2 fault: len & 3 bytes left | |
174 | tbit.nz p6,p7=len,1 | |
175 | ;; | |
176 | EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes | |
177 | and len3=1,len // st1 fault: 1 byte left | |
178 | tbit.nz p7,p6=len,0 | |
179 | ;; | |
180 | EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left | |
181 | mov ret0=r0 // success | |
182 | mov ar.lc=saved_lc // lc was overwritten on entry even when the core loop is skipped | |
183 | br.ret.sptk.many rp // end of most likely path | |
184 | ||
185 | // | |
186 | // Outlined error handling code | |
187 | // | |
188 | ||
189 | // | |
190 | // All fault paths return the bytes left in ret0 and restore ar.lc, | |
191 | // which is overwritten before the short/long split. | |
192 | // | |
193 | // .Lexit2: comes from the tail | |
194 | // if p6 -> coming from st8 or st2 : len2 contains what's left | |
195 | // if p7 -> coming from st4 or st1 : len3 contains what's left | |
196 | // p6 and p7 are complementary at every tail store (see the tbit pairs). | |
197 | .Lexit2: | |
198 | .pred.rel "mutex", p6, p7 | |
199 | (p6) mov len=len2 | |
200 | (p7) mov len=len3 | |
201 | ;; | |
202 | // | |
203 | // .Lexit3: comes from head and core loop (and falls in from .Lexit2) | |
204 | // len contains bytes left | |
205 | // | |
206 | .Lexit3: | |
207 | mov ret0=len | |
208 | mov ar.lc=saved_lc | |
209 | br.ret.sptk.many rp | |
210 | END(__do_clear_user) | |
e007c533 | 211 | EXPORT_SYMBOL(__do_clear_user) |