Merge remote-tracking branch 'mmc-uh/next'
[deliverable/linux.git] / arch / alpha / lib / ev6-copy_user.S
CommitLineData
1da177e4
LT
1/*
2 * arch/alpha/lib/ev6-copy_user.S
3 *
4 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 *
6 * Copy to/from user space, handling exceptions as we go.. This
7 * isn't exactly pretty.
8 *
9 * This is essentially the same as "memcpy()", but with a few twists.
10 * Notably, we have to make sure that $0 is always up-to-date and
11 * contains the right "bytes left to copy" value (and that it is updated
12 * only _after_ a successful copy). There is also some rather minor
13 * exception setup stuff..
14 *
15 * NOTE! This is not directly C-callable, because the calling semantics are
16 * different:
17 *
18 * Inputs:
19 * length in $0
20 * destination address in $6
21 * source address in $7
22 * return address in $28
23 *
24 * Outputs:
25 * bytes left to copy in $0
26 *
27 * Clobbers:
28 * $1,$2,$3,$4,$5,$6,$7
29 *
30 * Much of the information about 21264 scheduling/coding comes from:
31 * Compiler Writer's Guide for the Alpha 21264
32 * abbreviated as 'CWG' in other comments here
33 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
34 * Scheduling notation:
35 * E - either cluster
36 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
37 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
38 */
39
00fc0e0d 40#include <asm/export.h>
1da177e4
LT
41/* Allow an exception for an insn; exit if we get one. */
/*
 * EXI(insn, operands...) -- wrap an instruction that may fault.
 *
 * Emits the instruction at local label 99 and appends an entry to the
 * __ex_table section made of:
 *   .long 99b - .              offset from the entry back to the instruction
 *   lda $31, $exitin-99b($31)  displacement from the instruction to $exitin
 * .previous then switches back to the section that was active before.
 *
 * In this file EXI is applied to the loads from the source buffer.
 * NOTE(review): presumably the fault handler uses the entry to resume at
 * $exitin -- the handler is not part of this file, confirm there.
 */
42#define EXI(x,y...) \
43 99: x,##y; \
44 .section __ex_table,"a"; \
45 .long 99b - .; \
46 lda $31, $exitin-99b($31); \
47 .previous
48
/*
 * EXO(insn, operands...) -- same construction as EXI, except that the
 * __ex_table entry records $exitout instead of $exitin.
 *
 * In this file EXO is applied to the stores to the destination buffer.
 * NOTE(review): presumably a faulting store resumes at $exitout, which
 * returns without touching the destination again -- confirm against the
 * fault handler, which is not part of this file.
 */
49#define EXO(x,y...) \
50 99: x,##y; \
51 .section __ex_table,"a"; \
52 .long 99b - .; \
53 lda $31, $exitout-99b($31); \
54 .previous
55
56 .set noat
57 .align 4
/*
 * __copy_user: copy $0 bytes from the source at ($7) to the destination
 * at ($6) and return through $28 with $0 holding the bytes left uncopied.
 *
 * Path selection, as coded below:
 *   - length 0:      return immediately.
 *   - length <= 32:  plain byte loop ($onebyteloop).
 *   - otherwise:     byte-copy until the destination is 8-byte aligned
 *                    ($aligndest), then copy quadwords -- the unrolled and
 *                    single-quad loops when the source is aligned as well
 *                    ($unroll4 / $onequad), or the ldq_u/extql/extqh merge
 *                    loop when it is not ($misquad) -- and finish any
 *                    remainder in the byte loop.
 *
 * Every load is wrapped in EXI and every store in EXO, which register the
 * instruction in __ex_table with $exitin / $exitout as the recorded target.
 */
58 .globl __copy_user
59 .ent __copy_user
60 # Pipeline info: Slotting & Comments
61__copy_user:
62 .prologue 0
63 subq $0, 32, $1 # .. E .. .. : Is this going to be a small copy?
64 beq $0, $zerolength # U .. .. .. : U L U L
65
66 and $6,7,$3 # .. .. .. E : is leading dest misalignment
67 ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data
68 beq $3, $destaligned # .. U .. .. : 2nd (one cycle fetcher stall)
69 subq $3, 8, $3 # E .. .. .. : L U U L : trip counter
70/*
71 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
72 * This loop aligns the destination a byte at a time
73 * We know we have at least one trip through this loop
 *
 * $3 = (dest & 7) - 8 is a negative trip count: it is incremented once
 * per byte and the loop ends when it reaches zero.
 * $1 carries the byte from the load to the store.
74 */
75$aligndest:
76 EXI( ldbu $1,0($7) ) # .. .. .. L : Keep loads separate from stores
77 addq $6,1,$6 # .. .. E .. : Section 3.8 in the CWG
78 addq $3,1,$3 # .. E .. .. :
79 nop # E .. .. .. : U L U L
80
81/*
82 * the -1 is to compensate for the inc($6) done in a previous quadpack
83 * which allows us zero dependencies within either quadpack in the loop
84 */
85 EXO( stb $1,-1($6) ) # .. .. .. L :
86 addq $7,1,$7 # .. .. E .. : Section 3.8 in the CWG
87 subq $0,1,$0 # .. E .. .. : count is reduced only after the store
88 bne $3, $aligndest # U .. .. .. : U L U L
89
90/*
91 * If we fell through into here, we have a minimum of 33 - 7 bytes
92 * If we arrived via branch, we have a minimum of 32 bytes
93 */
94$destaligned:
95 and $7,7,$1 # .. .. .. E : Check _current_ source alignment
96 bic $0,7,$4 # .. .. E .. : number bytes as a quadword loop
97 EXI( ldq_u $3,0($7) ) # .. L .. .. : Forward fetch for fallthrough code
98 beq $1,$quadaligned # U .. .. .. : U L U L
99
100/*
101 * In the worst case, we've just executed an ldq_u here from 0($7)
102 * and we'll repeat it once if we take the branch
103 */
104
105/* Misaligned quadword loop - not unrolled. Leave it that way. */
/*
 * Register roles in this loop:
 *   $3 - source quad fetched on the previous trip (or by the forward
 *        fetch above on the first trip)
 *   $2 - next source quad, fetched from 8($7)
 *   $1 - extql($3) merged with extqh($2): the quad stored to 0($6)
 *   $4 - quadword byte count still to do; $0 - total bytes still to do
 */
106$misquad:
107 EXI( ldq_u $2,8($7) ) # .. .. .. L :
108 subq $4,8,$4 # .. .. E .. :
109 extql $3,$7,$3 # .. U .. .. :
110 extqh $2,$7,$1 # U .. .. .. : U U L L
111
112 bis $3,$1,$1 # .. .. .. E :
113 EXO( stq $1,0($6) ) # .. .. L .. :
114 addq $7,8,$7 # .. E .. .. :
115 subq $0,8,$0 # E .. .. .. : U L L U
116
117 addq $6,8,$6 # .. .. .. E :
118 bis $2,$2,$3 # .. .. E .. : keep the just-loaded quad for the next trip
119 nop # .. E .. .. :
120 bne $4,$misquad # U .. .. .. : U L U L
121
122 nop # .. .. .. E
123 nop # .. .. E ..
124 nop # .. E .. ..
125 beq $0,$zerolength # U .. .. .. : U L U L
126
127/* We know we have at least one trip through the byte loop */
128 EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
129 addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
130 nop # .. E .. .. :
131 br $31, $dirtyentry # L0 .. .. .. : L U U L
132/* Do the trailing byte loop load, then hop into the store part of the loop */
133
134/*
135 * A minimum of (33 - 7) bytes to do a quad at a time.
136 * Based upon the usage context, it's worth the effort to unroll this loop
137 * $0 - number of bytes to be moved
138 * $4 - number of bytes to move as quadwords
139 * $6 is current destination address
140 * $7 is current source address
141 */
142$quadaligned:
143 subq $4, 32, $2 # .. .. .. E : do not unroll for small stuff
144 nop # .. .. E ..
145 nop # .. E .. ..
146 blt $2, $onequad # U .. .. .. : U L U L
147
148/*
149 * There is a significant assumption here that the source and destination
150 * addresses differ by more than 32 bytes. In this particular case, a
151 * sparsity of registers further bounds this to be a minimum of 8 bytes.
152 * But if this isn't met, then the output result will be incorrect.
153 * Furthermore, due to a lack of available registers, we really can't
154 * unroll this to be an 8x loop (which would enable us to use the wh64
155 * instruction memory hint instruction).
 *
 * Each trip moves 32 bytes as two load-pair/store-pair halves of 16 bytes,
 * using $1 and $2 as the data registers. $4 is reduced by 32 once per
 * trip, $0 by 16 after each half, and $3 = $4 - 32 decides whether
 * another full trip is taken.
156 */
157$unroll4:
158 EXI( ldq $1,0($7) ) # .. .. .. L
159 EXI( ldq $2,8($7) ) # .. .. L ..
160 subq $4,32,$4 # .. E .. ..
161 nop # E .. .. .. : U U L L
162
163 addq $7,16,$7 # .. .. .. E
164 EXO( stq $1,0($6) ) # .. .. L ..
165 EXO( stq $2,8($6) ) # .. L .. ..
166 subq $0,16,$0 # E .. .. .. : U L L U
167
168 addq $6,16,$6 # .. .. .. E
169 EXI( ldq $1,0($7) ) # .. .. L ..
170 EXI( ldq $2,8($7) ) # .. L .. ..
171 subq $4, 32, $3 # E .. .. .. : U U L L : is there enough for another trip?
172
173 EXO( stq $1,0($6) ) # .. .. .. L
174 EXO( stq $2,8($6) ) # .. .. L ..
175 subq $0,16,$0 # .. E .. ..
176 addq $7,16,$7 # E .. .. .. : U L L U
177
178 nop # .. .. .. E
179 nop # .. .. E ..
180 addq $6,16,$6 # .. E .. ..
181 bgt $3,$unroll4 # U .. .. .. : U L U L
182
/*
 * NOTE(review): the nop triples here and below presumably pad the branch
 * into the last slot of its quadpack, like the annotated ones above.
 */
183 nop
184 nop
185 nop
186 beq $4, $noquads # no whole quadwords left after the unrolled loop
187
/*
 * Copy one quadword per trip through $1 until the quadword byte count
 * in $4 reaches zero. $0 is reduced only after the store.
 */
188$onequad:
189 EXI( ldq $1,0($7) )
190 subq $4,8,$4
191 addq $7,8,$7
192 nop
193
194 EXO( stq $1,0($6) )
195 subq $0,8,$0
196 addq $6,8,$6
197 bne $4,$onequad
198
199$noquads:
200 nop
201 nop
202 nop
203 beq $0,$zerolength # nothing left at all: done, else fall into the byte loop
204
205/*
206 * For small copies (or the tail of a larger copy), do a very simple byte loop.
207 * There's no point in doing a lot of complex alignment calculations to try to
208 * do quadword stuff for a small amount of data.
209 * $0 - remaining number of bytes left to copy
210 * $6 - current dest addr
211 * $7 - current source addr
212 */
213
214$onebyteloop:
215 EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
216 addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
217 nop # .. E .. .. :
218 nop # E .. .. .. : U L U L
219
220$dirtyentry:
221/*
222 * the -1 is to compensate for the inc($6) done in a previous quadpack
223 * which allows us zero dependencies within either quadpack in the loop
 *
 * The misaligned-source path also branches in here, after doing its own
 * byte load into $2 and its own increment of $6.
224 */
225 EXO ( stb $2,-1($6) ) # .. .. .. L :
226 addq $7,1,$7 # .. .. E .. : quadpack as the load
227 subq $0,1,$0 # .. E .. .. : change count _after_ copy
228 bgt $0,$onebyteloop # U .. .. .. : U L U L
229
230$zerolength:
231$exitout: # Normal exit, and the target recorded by EXO for stores
232 nop # .. .. .. E
233 nop # .. .. E ..
234 nop # .. E .. ..
235 ret $31,($28),1 # L0 .. .. .. : L U L U
236
/*
 * Target recorded by EXI for the loads. $0 still holds the number of
 * bytes that were not copied and $6 points at the first unwritten
 * destination byte.
 */
237$exitin:
238
239 /* A stupid byte-by-byte zeroing of the rest of the output
240 buffer. This cures security holes by never leaving
241 random kernel data around to be copied elsewhere. */
242
243 nop
244 nop
245 nop
246 mov $0,$1 # count down in $1 so that $0 stays intact as the return value
247
/*
 * Store a zero byte ($31) at ($6) for each of the $0 uncopied bytes.
 * The store is wrapped in EXO, so its __ex_table entry records $exitout.
 */
248$101:
249 EXO ( stb $31,0($6) ) # L
250 subq $1,1,$1 # E
251 addq $6,1,$6 # E
252 bgt $1,$101 # U
253
254 nop
255 nop
256 nop
257 ret $31,($28),1 # L0
258
259 .end __copy_user
00fc0e0d 260 EXPORT_SYMBOL(__copy_user)
This page took 0.766294 seconds and 5 git commands to generate.