#ifndef _VIDEO_ATAFB_UTILS_H
#define _VIDEO_ATAFB_UTILS_H

#include <linux/types.h>

/* ================================================================= */
/*                   Utility Assembler Functions                     */
/* ================================================================= */

/* ====================================================================== */

/* Those of a delicate disposition might like to skip the next couple of
 * pages.
 *
 * These functions are drop-in replacements for memmove and
 * memset(_, 0, _). However, their five instances add at least a kilobyte
 * to the object file. You have been warned.
 *
 * Not a great fan of assembler for the sake of it, but I think
 * that these routines are at least 10 times faster than their C
 * equivalents for large blits, and that's important at the lowest level of
 * a graphics driver. The question is whether some scheme using the blitter
 * would be faster. I suspect not for a simple text system - not much
 * asynchrony.
 *
 * The code is very simple, just gruesome expansion. The basic strategy is
 * to increase the data moved/cleared at each step to 16 bytes, to reduce
 * the instruction-per-data-move overhead. movem might be faster still.
 * For more than 15 bytes, we try to align the write direction on a
 * longword boundary to get maximum speed. This is even more gruesome.
 * The unaligned reads/writes used require a 68020+ - think this is a problem?
 *
 * Sorry!
 */
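
/*
 * Illustration only (not part of the original driver): a plain-C sketch of
 * the strategy the assembler below uses for fb_memclear() -- peel off
 * bytes/words until the pointer is longword aligned, clear in 16-byte
 * chunks, then mop up the tail.  The function name is hypothetical; the
 * real routines below are the ones actually used.
 */
#if 0	/* reference sketch, never compiled */
static inline void fb_memclear_c_sketch(void *s, size_t count)
{
	char *p = s;

	/* align the write pointer on a longword boundary */
	while (count && ((unsigned long)p & 3)) {
		*p++ = 0;
		count--;
	}
	/* main loop: 16 bytes per iteration, like the clr.l/movem.l code */
	while (count >= 16) {
		((u32 *)p)[0] = 0;
		((u32 *)p)[1] = 0;
		((u32 *)p)[2] = 0;
		((u32 *)p)[3] = 0;
		p += 16;
		count -= 16;
	}
	/* tail: whatever is left over */
	while (count--)
		*p++ = 0;
}
#endif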
33 | ||
34 | ||
35 | /* ++roman: I've optimized Robert's original versions in some minor | |
36 | * aspects, e.g. moveq instead of movel, let gcc choose the registers, | |
37 | * use movem in some places... | |
38 | * For other modes than 1 plane, lots of more such assembler functions | |
39 | * were needed (e.g. the ones using movep or expanding color values). | |
40 | */ | |
41 | ||
42 | /* ++andreas: more optimizations: | |
43 | subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc | |
44 | addal is faster than addaw | |
45 | movep is rather expensive compared to ordinary move's | |
46 | some functions rewritten in C for clarity, no speed loss */ | |
47 | ||
48 | static inline void *fb_memclear_small(void *s, size_t count) | |
49 | { | |
50 | if (!count) | |
51 | return 0; | |
52 | ||
53 | asm volatile ("\n" | |
54 | " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n" | |
55 | "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n" | |
56 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n" | |
57 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n" | |
58 | "1:" | |
59 | : "=a" (s), "=d" (count) | |
60 | : "d" (0), "0" ((char *)s + count), "1" (count)); | |
61 | asm volatile ("\n" | |
62 | " subq.l #1,%1\n" | |
63 | " jcs 3f\n" | |
64 | " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n" | |
65 | "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n" | |
66 | " dbra %1,2b\n" | |
67 | "3:" | |
68 | : "=a" (s), "=d" (count) | |
69 | : "d" (0), "0" (s), "1" (count) | |
70 | : "d4", "d5", "d6" | |
71 | ); | |
72 | ||
73 | return 0; | |
74 | } | |
75 | ||
76 | ||
77 | static inline void *fb_memclear(void *s, size_t count) | |
78 | { | |
79 | if (!count) | |
80 | return 0; | |
81 | ||
82 | if (count < 16) { | |
83 | asm volatile ("\n" | |
84 | " lsr.l #1,%1 ; jcc 1f ; clr.b (%0)+\n" | |
85 | "1: lsr.l #1,%1 ; jcc 1f ; clr.w (%0)+\n" | |
86 | "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+\n" | |
87 | "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+ ; clr.l (%0)+\n" | |
88 | "1:" | |
89 | : "=a" (s), "=d" (count) | |
90 | : "0" (s), "1" (count)); | |
91 | } else { | |
92 | long tmp; | |
93 | asm volatile ("\n" | |
94 | " move.l %1,%2\n" | |
95 | " lsr.l #1,%2 ; jcc 1f ; clr.b (%0)+ ; subq.w #1,%1\n" | |
96 | " lsr.l #1,%2 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ | |
97 | " clr.w (%0)+ ; subq.w #2,%1 ; jra 2f\n" | |
98 | "1: lsr.l #1,%2 ; jcc 2f\n" | |
99 | " clr.w (%0)+ ; subq.w #2,%1\n" | |
100 | "2: move.w %1,%2; lsr.l #2,%1 ; jeq 6f\n" | |
101 | " lsr.l #1,%1 ; jcc 3f ; clr.l (%0)+\n" | |
102 | "3: lsr.l #1,%1 ; jcc 4f ; clr.l (%0)+ ; clr.l (%0)+\n" | |
103 | "4: subq.l #1,%1 ; jcs 6f\n" | |
104 | "5: clr.l (%0)+; clr.l (%0)+ ; clr.l (%0)+ ; clr.l (%0)+\n" | |
105 | " dbra %1,5b ; clr.w %1; subq.l #1,%1; jcc 5b\n" | |
106 | "6: move.w %2,%1; btst #1,%1 ; jeq 7f ; clr.w (%0)+\n" | |
107 | "7: btst #0,%1 ; jeq 8f ; clr.b (%0)+\n" | |
108 | "8:" | |
109 | : "=a" (s), "=d" (count), "=d" (tmp) | |
110 | : "0" (s), "1" (count)); | |
111 | } | |
112 | ||
113 | return 0; | |
114 | } | |
115 | ||
116 | ||
117 | static inline void *fb_memset255(void *s, size_t count) | |
118 | { | |
119 | if (!count) | |
120 | return 0; | |
121 | ||
122 | asm volatile ("\n" | |
123 | " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n" | |
124 | "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n" | |
125 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n" | |
126 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n" | |
127 | "1:" | |
128 | : "=a" (s), "=d" (count) | |
129 | : "d" (-1), "0" ((char *)s+count), "1" (count)); | |
130 | asm volatile ("\n" | |
131 | " subq.l #1,%1 ; jcs 3f\n" | |
132 | " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n" | |
133 | "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n" | |
134 | " dbra %1,2b\n" | |
135 | "3:" | |
136 | : "=a" (s), "=d" (count) | |
137 | : "d" (-1), "0" (s), "1" (count) | |
138 | : "d4", "d5", "d6"); | |
139 | ||
140 | return 0; | |
141 | } | |
142 | ||
143 | ||
144 | static inline void *fb_memmove(void *d, const void *s, size_t count) | |
145 | { | |
146 | if (d < s) { | |
147 | if (count < 16) { | |
148 | asm volatile ("\n" | |
149 | " lsr.l #1,%2 ; jcc 1f ; move.b (%1)+,(%0)+\n" | |
150 | "1: lsr.l #1,%2 ; jcc 1f ; move.w (%1)+,(%0)+\n" | |
151 | "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+\n" | |
152 | "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n" | |
153 | "1:" | |
154 | : "=a" (d), "=a" (s), "=d" (count) | |
155 | : "0" (d), "1" (s), "2" (count)); | |
156 | } else { | |
157 | long tmp; | |
158 | asm volatile ("\n" | |
159 | " move.l %0,%3\n" | |
160 | " lsr.l #1,%3 ; jcc 1f ; move.b (%1)+,(%0)+ ; subqw #1,%2\n" | |
161 | " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ | |
162 | " move.w (%1)+,(%0)+ ; subqw #2,%2 ; jra 2f\n" | |
163 | "1: lsr.l #1,%3 ; jcc 2f\n" | |
164 | " move.w (%1)+,(%0)+ ; subqw #2,%2\n" | |
165 | "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n" | |
166 | " lsr.l #1,%2 ; jcc 3f ; move.l (%1)+,(%0)+\n" | |
167 | "3: lsr.l #1,%2 ; jcc 4f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n" | |
168 | "4: subq.l #1,%2 ; jcs 6f\n" | |
169 | "5: move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n" | |
170 | " move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n" | |
171 | " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n" | |
172 | "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w (%1)+,(%0)+\n" | |
173 | "7: btst #0,%2 ; jeq 8f ; move.b (%1)+,(%0)+\n" | |
174 | "8:" | |
175 | : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp) | |
176 | : "0" (d), "1" (s), "2" (count)); | |
177 | } | |
178 | } else { | |
179 | if (count < 16) { | |
180 | asm volatile ("\n" | |
181 | " lsr.l #1,%2 ; jcc 1f ; move.b -(%1),-(%0)\n" | |
182 | "1: lsr.l #1,%2 ; jcc 1f ; move.w -(%1),-(%0)\n" | |
183 | "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0)\n" | |
184 | "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n" | |
185 | "1:" | |
186 | : "=a" (d), "=a" (s), "=d" (count) | |
187 | : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)); | |
188 | } else { | |
189 | long tmp; | |
190 | ||
191 | asm volatile ("\n" | |
192 | " move.l %0,%3\n" | |
193 | " lsr.l #1,%3 ; jcc 1f ; move.b -(%1),-(%0) ; subqw #1,%2\n" | |
194 | " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ | |
195 | " move.w -(%1),-(%0) ; subqw #2,%2 ; jra 2f\n" | |
196 | "1: lsr.l #1,%3 ; jcc 2f\n" | |
197 | " move.w -(%1),-(%0) ; subqw #2,%2\n" | |
198 | "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n" | |
199 | " lsr.l #1,%2 ; jcc 3f ; move.l -(%1),-(%0)\n" | |
200 | "3: lsr.l #1,%2 ; jcc 4f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n" | |
201 | "4: subq.l #1,%2 ; jcs 6f\n" | |
202 | "5: move.l -(%1),-(%0); move.l -(%1),-(%0)\n" | |
203 | " move.l -(%1),-(%0); move.l -(%1),-(%0)\n" | |
204 | " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n" | |
205 | "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w -(%1),-(%0)\n" | |
206 | "7: btst #0,%2 ; jeq 8f ; move.b -(%1),-(%0)\n" | |
207 | "8:" | |
208 | : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp) | |
209 | : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)); | |
210 | } | |
211 | } | |
212 | ||
213 | return 0; | |
214 | } | |
215 | ||
216 | ||
217 | /* ++andreas: Simple and fast version of memmove, assumes size is | |
218 | divisible by 16, suitable for moving the whole screen bitplane */ | |
219 | static inline void fast_memmove(char *dst, const char *src, size_t size) | |
220 | { | |
221 | if (!size) | |
222 | return; | |
223 | if (dst < src) | |
224 | asm volatile ("\n" | |
225 | "1: movem.l (%0)+,%%d0/%%d1/%%a0/%%a1\n" | |
226 | " movem.l %%d0/%%d1/%%a0/%%a1,%1@\n" | |
227 | " addq.l #8,%1; addq.l #8,%1\n" | |
228 | " dbra %2,1b\n" | |
229 | " clr.w %2; subq.l #1,%2\n" | |
230 | " jcc 1b" | |
231 | : "=a" (src), "=a" (dst), "=d" (size) | |
232 | : "0" (src), "1" (dst), "2" (size / 16 - 1) | |
233 | : "d0", "d1", "a0", "a1", "memory"); | |
234 | else | |
235 | asm volatile ("\n" | |
236 | "1: subq.l #8,%0; subq.l #8,%0\n" | |
237 | " movem.l %0@,%%d0/%%d1/%%a0/%%a1\n" | |
238 | " movem.l %%d0/%%d1/%%a0/%%a1,-(%1)\n" | |
239 | " dbra %2,1b\n" | |
240 | " clr.w %2; subq.l #1,%2\n" | |
241 | " jcc 1b" | |
242 | : "=a" (src), "=a" (dst), "=d" (size) | |
243 | : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1) | |
244 | : "d0", "d1", "a0", "a1", "memory"); | |
245 | } | |
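
/*
 * Usage sketch (illustration only, not part of the original file):
 * fast_memmove() requires the size to be a multiple of 16, so callers
 * pass whole-bitplane amounts.  The helper name and geometry below are
 * hypothetical, for a 640x400 monochrome plane (80 bytes per line).
 */
#if 0	/* usage sketch, never compiled */
static inline void scroll_plane_up_one_line(char *plane)
{
	/* move 399 lines up by one line; 399 * 80 = 31920 is divisible by 16 */
	fast_memmove(plane, plane + 80, 399 * 80);
}
#endif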
246 | ||
247 | #ifdef BPL | |
248 | ||
249 | /* | |
250 | * This expands a up to 8 bit color into two longs | |
251 | * for movel operations. | |
252 | */ | |
253 | static const u32 four2long[] = { | |
254 | 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff, | |
255 | 0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff, | |
256 | 0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff, | |
257 | 0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff, | |
258 | }; | |
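
/*
 * Worked example (illustration, not part of the original file): bit n of
 * the 4-bit table index selects byte n of the longword, e.g.
 * four2long[0x5] (binary 0101) == 0x00ff00ff.  Each set bit of the color
 * nibble thus becomes a full 0xff byte, one byte per bitplane, which
 * fill8_col() below peels off again with shifts and byte stores.
 */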
259 | ||
260 | static inline void expand8_col2mask(u8 c, u32 m[]) | |
261 | { | |
262 | m[0] = four2long[c & 15]; | |
263 | #if BPL > 4 | |
264 | m[1] = four2long[c >> 4]; | |
265 | #endif | |
266 | } | |
267 | ||
268 | static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[]) | |
269 | { | |
270 | fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]); | |
271 | #if BPL > 4 | |
272 | fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]); | |
273 | #endif | |
274 | } | |
275 | ||
/*
 * Set 8 pixels (one byte per plane) to a color: the expanded color masks
 * are written to every other byte, matching the word-interleaved plane
 * layout.
 */
static inline void fill8_col(u8 *dst, u32 m[])
{
	u32 tmp = m[0];
	dst[0] = tmp;
	dst[2] = (tmp >>= 8);
#if BPL > 2
	dst[4] = (tmp >>= 8);
	dst[6] = tmp >> 8;
#endif
#if BPL > 4
	tmp = m[1];
	dst[8] = tmp;
	dst[10] = (tmp >>= 8);
	dst[12] = (tmp >>= 8);
	dst[14] = tmp >> 8;
#endif
}
296 | ||
297 | /* | |
298 | * set an 8bit value according to foreground/background color | |
299 | */ | |
300 | static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask) | |
301 | { | |
302 | u32 fgm[2], bgm[2], tmp; | |
303 | ||
304 | expand8_2col2mask(fg, bg, fgm, bgm); | |
305 | ||
306 | mask |= mask << 8; | |
307 | #if BPL > 2 | |
308 | mask |= mask << 16; | |
309 | #endif | |
310 | tmp = (mask & fgm[0]) ^ bgm[0]; | |
311 | dst[0] = tmp; | |
312 | dst[2] = (tmp >>= 8); | |
313 | #if BPL > 2 | |
314 | dst[4] = (tmp >>= 8); | |
315 | dst[6] = tmp >> 8; | |
316 | #endif | |
317 | #if BPL > 4 | |
318 | tmp = (mask & fgm[1]) ^ bgm[1]; | |
319 | dst[8] = tmp; | |
320 | dst[10] = (tmp >>= 8); | |
321 | dst[12] = (tmp >>= 8); | |
322 | dst[14] = tmp >> 8; | |
323 | #endif | |
324 | } | |
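
/*
 * Illustration (not from the original file): because fgm[] holds fg ^ bg
 * after expand8_2col2mask(), the expression (mask & fgm[0]) ^ bgm[0]
 * evaluates to the foreground plane bits wherever a mask bit is set and
 * to the background plane bits wherever it is clear -- e.g. mask = 0xf0
 * gives fg to the four pixels selected by the high nibble and bg to the
 * rest.
 */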
325 | ||
326 | static const u32 two2word[] = { | |
327 | 0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff | |
328 | }; | |
329 | ||
330 | static inline void expand16_col2mask(u8 c, u32 m[]) | |
331 | { | |
332 | m[0] = two2word[c & 3]; | |
333 | #if BPL > 2 | |
334 | m[1] = two2word[(c >> 2) & 3]; | |
335 | #endif | |
336 | #if BPL > 4 | |
337 | m[2] = two2word[(c >> 4) & 3]; | |
338 | m[3] = two2word[c >> 6]; | |
339 | #endif | |
340 | } | |
341 | ||
342 | static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[]) | |
343 | { | |
344 | bgm[0] = two2word[bg & 3]; | |
345 | fgm[0] = two2word[fg & 3] ^ bgm[0]; | |
346 | #if BPL > 2 | |
347 | bgm[1] = two2word[(bg >> 2) & 3]; | |
348 | fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1]; | |
349 | #endif | |
350 | #if BPL > 4 | |
351 | bgm[2] = two2word[(bg >> 4) & 3]; | |
352 | fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2]; | |
353 | bgm[3] = two2word[bg >> 6]; | |
354 | fgm[3] = two2word[fg >> 6] ^ bgm[3]; | |
355 | #endif | |
356 | } | |
357 | ||
358 | static inline u32 *fill16_col(u32 *dst, int rows, u32 m[]) | |
359 | { | |
360 | while (rows) { | |
361 | *dst++ = m[0]; | |
362 | #if BPL > 2 | |
363 | *dst++ = m[1]; | |
364 | #endif | |
365 | #if BPL > 4 | |
366 | *dst++ = m[2]; | |
367 | *dst++ = m[3]; | |
368 | #endif | |
369 | rows--; | |
370 | } | |
371 | return dst; | |
372 | } | |
373 | ||
/*
 * Copy h rows of a 16-pixel-wide column through all interleaved planes,
 * changing only the bits selected by mask; after each row, both pointers
 * are advanced by an extra 'bytes' to reach the next line.
 */
static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes)
{
	u32 *s, *d, v;

	s = src;
	d = dst;
	do {
		v = (*s++ & mask) | (*d & ~mask);
		*d++ = v;
#if BPL > 2
		v = (*s++ & mask) | (*d & ~mask);
		*d++ = v;
#endif
#if BPL > 4
		v = (*s++ & mask) | (*d & ~mask);
		*d++ = v;
		v = (*s++ & mask) | (*d & ~mask);
		*d++ = v;
#endif
		d = (u32 *)((u8 *)d + bytes);
		s = (u32 *)((u8 *)s + bytes);
	} while (--h);
}

#endif /* BPL */

#endif /* _VIDEO_ATAFB_UTILS_H */