Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
atafb_utils.h
Go to the documentation of this file.
1 #ifndef _VIDEO_ATAFB_UTILS_H
2 #define _VIDEO_ATAFB_UTILS_H
3 
4 /* ================================================================= */
5 /* Utility Assembler Functions */
6 /* ================================================================= */
7 
8 /* ====================================================================== */
9 
10 /* Those of a delicate disposition might like to skip the next couple of
11  * pages.
12  *
13  * These functions are drop in replacements for memmove and
14  * memset(_, 0, _). However their five instances add at least a kilobyte
15  * to the object file. You have been warned.
16  *
17  * Not a great fan of assembler for the sake of it, but I think
18  * that these routines are at least 10 times faster than their C
19  * equivalents for large blits, and that's important to the lowest level of
20  * a graphics driver. Question is whether some scheme with the blitter
21  * would be faster. I suspect not for simple text system - not much
22  * asynchrony.
23  *
24  * Code is very simple, just gruesome expansion. Basic strategy is to
25  * increase data moved/cleared at each step to 16 bytes to reduce
26  * instruction per data move overhead. movem might be faster still.
27  * For more than 15 bytes, we try to align the write direction on a
28  * longword boundary to get maximum speed. This is even more gruesome.
29  * Unaligned read/write used requires 68020+ - think this is a problem?
30  *
31  * Sorry!
32  */
33 
34 
35 /* ++roman: I've optimized Robert's original versions in some minor
36  * aspects, e.g. moveq instead of movel, let gcc choose the registers,
37  * use movem in some places...
38  * For modes other than 1 plane, many more such assembler functions
39  * were needed (e.g. the ones using movep or expanding color values).
40  */
41 
42 /* ++andreas: more optimizations:
43  subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc
44  addal is faster than addaw
45  movep is rather expensive compared to ordinary move's
46  some functions rewritten in C for clarity, no speed loss */
47 
48 static inline void *fb_memclear_small(void *s, size_t count)
49 {
50  if (!count)
51  return 0;
52 
53  asm volatile ("\n"
54  " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n"
55  "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n"
56  "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n"
57  "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n"
58  "1:"
59  : "=a" (s), "=d" (count)
60  : "d" (0), "0" ((char *)s + count), "1" (count));
61  asm volatile ("\n"
62  " subq.l #1,%1\n"
63  " jcs 3f\n"
64  " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n"
65  "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n"
66  " dbra %1,2b\n"
67  "3:"
68  : "=a" (s), "=d" (count)
69  : "d" (0), "0" (s), "1" (count)
70  : "d4", "d5", "d6"
71  );
72 
73  return 0;
74 }
75 
76 
77 static inline void *fb_memclear(void *s, size_t count)
78 {
79  if (!count)
80  return 0;
81 
82  if (count < 16) {
83  asm volatile ("\n"
84  " lsr.l #1,%1 ; jcc 1f ; clr.b (%0)+\n"
85  "1: lsr.l #1,%1 ; jcc 1f ; clr.w (%0)+\n"
86  "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+\n"
87  "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+ ; clr.l (%0)+\n"
88  "1:"
89  : "=a" (s), "=d" (count)
90  : "0" (s), "1" (count));
91  } else {
92  long tmp;
93  asm volatile ("\n"
94  " move.l %1,%2\n"
95  " lsr.l #1,%2 ; jcc 1f ; clr.b (%0)+ ; subq.w #1,%1\n"
96  " lsr.l #1,%2 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/
97  " clr.w (%0)+ ; subq.w #2,%1 ; jra 2f\n"
98  "1: lsr.l #1,%2 ; jcc 2f\n"
99  " clr.w (%0)+ ; subq.w #2,%1\n"
100  "2: move.w %1,%2; lsr.l #2,%1 ; jeq 6f\n"
101  " lsr.l #1,%1 ; jcc 3f ; clr.l (%0)+\n"
102  "3: lsr.l #1,%1 ; jcc 4f ; clr.l (%0)+ ; clr.l (%0)+\n"
103  "4: subq.l #1,%1 ; jcs 6f\n"
104  "5: clr.l (%0)+; clr.l (%0)+ ; clr.l (%0)+ ; clr.l (%0)+\n"
105  " dbra %1,5b ; clr.w %1; subq.l #1,%1; jcc 5b\n"
106  "6: move.w %2,%1; btst #1,%1 ; jeq 7f ; clr.w (%0)+\n"
107  "7: btst #0,%1 ; jeq 8f ; clr.b (%0)+\n"
108  "8:"
109  : "=a" (s), "=d" (count), "=d" (tmp)
110  : "0" (s), "1" (count));
111  }
112 
113  return 0;
114 }
115 
116 
117 static inline void *fb_memset255(void *s, size_t count)
118 {
119  if (!count)
120  return 0;
121 
122  asm volatile ("\n"
123  " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n"
124  "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n"
125  "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n"
126  "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n"
127  "1:"
128  : "=a" (s), "=d" (count)
129  : "d" (-1), "0" ((char *)s+count), "1" (count));
130  asm volatile ("\n"
131  " subq.l #1,%1 ; jcs 3f\n"
132  " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n"
133  "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n"
134  " dbra %1,2b\n"
135  "3:"
136  : "=a" (s), "=d" (count)
137  : "d" (-1), "0" (s), "1" (count)
138  : "d4", "d5", "d6");
139 
140  return 0;
141 }
142 
143 
144 static inline void *fb_memmove(void *d, const void *s, size_t count)
145 {
146  if (d < s) {
147  if (count < 16) {
148  asm volatile ("\n"
149  " lsr.l #1,%2 ; jcc 1f ; move.b (%1)+,(%0)+\n"
150  "1: lsr.l #1,%2 ; jcc 1f ; move.w (%1)+,(%0)+\n"
151  "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+\n"
152  "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n"
153  "1:"
154  : "=a" (d), "=a" (s), "=d" (count)
155  : "0" (d), "1" (s), "2" (count));
156  } else {
157  long tmp;
158  asm volatile ("\n"
159  " move.l %0,%3\n"
160  " lsr.l #1,%3 ; jcc 1f ; move.b (%1)+,(%0)+ ; subqw #1,%2\n"
161  " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/
162  " move.w (%1)+,(%0)+ ; subqw #2,%2 ; jra 2f\n"
163  "1: lsr.l #1,%3 ; jcc 2f\n"
164  " move.w (%1)+,(%0)+ ; subqw #2,%2\n"
165  "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n"
166  " lsr.l #1,%2 ; jcc 3f ; move.l (%1)+,(%0)+\n"
167  "3: lsr.l #1,%2 ; jcc 4f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n"
168  "4: subq.l #1,%2 ; jcs 6f\n"
169  "5: move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n"
170  " move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n"
171  " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n"
172  "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w (%1)+,(%0)+\n"
173  "7: btst #0,%2 ; jeq 8f ; move.b (%1)+,(%0)+\n"
174  "8:"
175  : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
176  : "0" (d), "1" (s), "2" (count));
177  }
178  } else {
179  if (count < 16) {
180  asm volatile ("\n"
181  " lsr.l #1,%2 ; jcc 1f ; move.b -(%1),-(%0)\n"
182  "1: lsr.l #1,%2 ; jcc 1f ; move.w -(%1),-(%0)\n"
183  "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0)\n"
184  "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n"
185  "1:"
186  : "=a" (d), "=a" (s), "=d" (count)
187  : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count));
188  } else {
189  long tmp;
190 
191  asm volatile ("\n"
192  " move.l %0,%3\n"
193  " lsr.l #1,%3 ; jcc 1f ; move.b -(%1),-(%0) ; subqw #1,%2\n"
194  " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/
195  " move.w -(%1),-(%0) ; subqw #2,%2 ; jra 2f\n"
196  "1: lsr.l #1,%3 ; jcc 2f\n"
197  " move.w -(%1),-(%0) ; subqw #2,%2\n"
198  "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n"
199  " lsr.l #1,%2 ; jcc 3f ; move.l -(%1),-(%0)\n"
200  "3: lsr.l #1,%2 ; jcc 4f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n"
201  "4: subq.l #1,%2 ; jcs 6f\n"
202  "5: move.l -(%1),-(%0); move.l -(%1),-(%0)\n"
203  " move.l -(%1),-(%0); move.l -(%1),-(%0)\n"
204  " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n"
205  "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w -(%1),-(%0)\n"
206  "7: btst #0,%2 ; jeq 8f ; move.b -(%1),-(%0)\n"
207  "8:"
208  : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
209  : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count));
210  }
211  }
212 
213  return 0;
214 }
215 
216 
217 /* ++andreas: Simple and fast version of memmove, assumes size is
218  divisible by 16, suitable for moving the whole screen bitplane */
219 static inline void fast_memmove(char *dst, const char *src, size_t size)
220 {
221  if (!size)
222  return;
223  if (dst < src)
224  asm volatile ("\n"
225  "1: movem.l (%0)+,%%d0/%%d1/%%a0/%%a1\n"
226  " movem.l %%d0/%%d1/%%a0/%%a1,%1@\n"
227  " addq.l #8,%1; addq.l #8,%1\n"
228  " dbra %2,1b\n"
229  " clr.w %2; subq.l #1,%2\n"
230  " jcc 1b"
231  : "=a" (src), "=a" (dst), "=d" (size)
232  : "0" (src), "1" (dst), "2" (size / 16 - 1)
233  : "d0", "d1", "a0", "a1", "memory");
234  else
235  asm volatile ("\n"
236  "1: subq.l #8,%0; subq.l #8,%0\n"
237  " movem.l %0@,%%d0/%%d1/%%a0/%%a1\n"
238  " movem.l %%d0/%%d1/%%a0/%%a1,-(%1)\n"
239  " dbra %2,1b\n"
240  " clr.w %2; subq.l #1,%2\n"
241  " jcc 1b"
242  : "=a" (src), "=a" (dst), "=d" (size)
243  : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1)
244  : "d0", "d1", "a0", "a1", "memory");
245 }
246 
247 #ifdef BPL
248 
249 /*
250  * This expands a up to 8 bit color into two longs
251  * for movel operations.
252  */
253 static const u32 four2long[] = {
254  0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
255  0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
256  0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
257  0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff,
258 };
259 
260 static inline void expand8_col2mask(u8 c, u32 m[])
261 {
262  m[0] = four2long[c & 15];
263 #if BPL > 4
264  m[1] = four2long[c >> 4];
265 #endif
266 }
267 
268 static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
269 {
270  fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]);
271 #if BPL > 4
272  fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]);
273 #endif
274 }
275 
276 /*
277  * set an 8bit value to a color
278  */
279 static inline void fill8_col(u8 *dst, u32 m[])
280 {
281  u32 tmp = m[0];
282  dst[0] = tmp;
283  dst[2] = (tmp >>= 8);
284 #if BPL > 2
285  dst[4] = (tmp >>= 8);
286  dst[6] = tmp >> 8;
287 #endif
288 #if BPL > 4
289  tmp = m[1];
290  dst[8] = tmp;
291  dst[10] = (tmp >>= 8);
292  dst[12] = (tmp >>= 8);
293  dst[14] = tmp >> 8;
294 #endif
295 }
296 
297 /*
298  * set an 8bit value according to foreground/background color
299  */
300 static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask)
301 {
302  u32 fgm[2], bgm[2], tmp;
303 
304  expand8_2col2mask(fg, bg, fgm, bgm);
305 
306  mask |= mask << 8;
307 #if BPL > 2
308  mask |= mask << 16;
309 #endif
310  tmp = (mask & fgm[0]) ^ bgm[0];
311  dst[0] = tmp;
312  dst[2] = (tmp >>= 8);
313 #if BPL > 2
314  dst[4] = (tmp >>= 8);
315  dst[6] = tmp >> 8;
316 #endif
317 #if BPL > 4
318  tmp = (mask & fgm[1]) ^ bgm[1];
319  dst[8] = tmp;
320  dst[10] = (tmp >>= 8);
321  dst[12] = (tmp >>= 8);
322  dst[14] = tmp >> 8;
323 #endif
324 }
325 
326 static const u32 two2word[] = {
327  0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff
328 };
329 
330 static inline void expand16_col2mask(u8 c, u32 m[])
331 {
332  m[0] = two2word[c & 3];
333 #if BPL > 2
334  m[1] = two2word[(c >> 2) & 3];
335 #endif
336 #if BPL > 4
337  m[2] = two2word[(c >> 4) & 3];
338  m[3] = two2word[c >> 6];
339 #endif
340 }
341 
342 static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
343 {
344  bgm[0] = two2word[bg & 3];
345  fgm[0] = two2word[fg & 3] ^ bgm[0];
346 #if BPL > 2
347  bgm[1] = two2word[(bg >> 2) & 3];
348  fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1];
349 #endif
350 #if BPL > 4
351  bgm[2] = two2word[(bg >> 4) & 3];
352  fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2];
353  bgm[3] = two2word[bg >> 6];
354  fgm[3] = two2word[fg >> 6] ^ bgm[3];
355 #endif
356 }
357 
358 static inline u32 *fill16_col(u32 *dst, int rows, u32 m[])
359 {
360  while (rows) {
361  *dst++ = m[0];
362 #if BPL > 2
363  *dst++ = m[1];
364 #endif
365 #if BPL > 4
366  *dst++ = m[2];
367  *dst++ = m[3];
368 #endif
369  rows--;
370  }
371  return dst;
372 }
373 
374 static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes)
375 {
376  u32 *s, *d, v;
377 
378  s = src;
379  d = dst;
380  do {
381  v = (*s++ & mask) | (*d & ~mask);
382  *d++ = v;
383 #if BPL > 2
384  v = (*s++ & mask) | (*d & ~mask);
385  *d++ = v;
386 #endif
387 #if BPL > 4
388  v = (*s++ & mask) | (*d & ~mask);
389  *d++ = v;
390  v = (*s++ & mask) | (*d & ~mask);
391  *d++ = v;
392 #endif
393  d = (u32 *)((u8 *)d + bytes);
394  s = (u32 *)((u8 *)s + bytes);
395  } while (--h);
396 }
397 
398 #endif
399 
400 #endif /* _VIDEO_ATAFB_UTILS_H */