Linux Kernel 3.7.1
arch/x86/lib/mmx_32.c
/*
 * MMX 3DNow! library helper functions
 *
 * To do:
 * We can use MMX just for prefetch in IRQs. This may be a win.
 *        (reported so on K6-III)
 * We should use a better code neutral filler for the short jump
 *        leal ebx, [ebx] is apparently best for K6-2, but Cyrix ??
 * We also want to clobber the filler register so we don't get any
 *        register forwarding stalls on the filler.
 *
 * Add *user handling. Checksums are not a win with MMX on any CPU
 * tested so far for any MMX solution figured.
 *
 * 22/09/2000 - Arjan van de Ven
 *        Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
        void *p;
        int i;

        if (unlikely(in_interrupt()))
                return __memcpy(to, from, len);

        p = to;
        i = len >> 6; /* len/64 */

        kernel_fpu_begin();

        /*
         * Prefetch the first 320 bytes.  If the prefetch at 1: faults,
         * the fixup at 3: overwrites it with a short jmp (0xEB 0x1A:
         * jump over the remaining 26 of the 28 bytes), so the prefetch
         * block is never executed again.
         */
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"            /* This set is 28 bytes */
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2: \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from));

        /*
         * Copy 64-byte blocks, prefetching 320 bytes ahead, while more
         * than five blocks remain; the last blocks are copied below
         * without prefetch so we never prefetch past the source buffer.
         */
        for ( ; i > 5; i--) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        /* Remaining full 64-byte blocks: no prefetch past the end. */
        for ( ; i > 0; i--) {
                __asm__ __volatile__ (
                "   movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        /*
         * Now do the tail of the block:
         */
        __memcpy(to, from, len & 63);
        kernel_fpu_end();

        return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

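/*
 * Illustrative sketch (not part of the original file): how a caller might
 * choose between _mmx_memcpy() and the plain __memcpy().  The helper name
 * example_memcpy3d() and the 512-byte crossover are assumptions made for
 * this example; the in-tree wiring is done by the CONFIG_X86_USE_3DNOW
 * glue in the x86-32 string headers.
 */
static inline void *example_memcpy3d(void *to, const void *from, size_t len)
{
        /* Small copies: the kernel_fpu_begin()/end() overhead dominates. */
        if (len < 512)
                return __memcpy(to, from, len);
        /* Large copies: 64-byte MMX blocks with prefetch are a win. */
        return _mmx_memcpy(to, from, len);
}
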
#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX using processors do not.
 */

static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "  movntq %%mm0, (%0)\n"
                "  movntq %%mm0, 8(%0)\n"
                "  movntq %%mm0, 16(%0)\n"
                "  movntq %%mm0, 24(%0)\n"
                "  movntq %%mm0, 32(%0)\n"
                "  movntq %%mm0, 40(%0)\n"
                "  movntq %%mm0, 48(%0)\n"
                "  movntq %%mm0, 56(%0)\n"
                : : "r" (page) : "memory");
                page += 64;
        }

        /*
         * Since movntq is weakly-ordered, an "sfence" is needed to order
         * the stores again:
         */
        __asm__ __volatile__("sfence\n"::);

        kernel_fpu_end();
}
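/*
 * Reference sketch (not part of the original file): the user-space
 * intrinsic equivalent of the movntq/sfence pattern above, handy when
 * experimenting outside the kernel.  It assumes a toolchain that provides
 * <mmintrin.h>/<xmmintrin.h>; kernel code must instead stay between
 * kernel_fpu_begin()/kernel_fpu_end() and use the raw instructions.
 *
 *      #include <mmintrin.h>
 *      #include <xmmintrin.h>
 *
 *      static void stream_clear_64(__m64 *dst)
 *      {
 *              __m64 zero = _mm_setzero_si64();
 *              int i;
 *
 *              for (i = 0; i < 8; i++)
 *                      _mm_stream_pi(dst + i, zero);  // movntq: cache-bypassing store
 *              _mm_sfence();                          // order the weakly-ordered stores
 *              _mm_empty();                           // emms: release the MMX state
 *      }
 */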

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        /*
         * maybe the prefetch stuff can go before the expensive fnsave...
         * but that is for later. -AV
         */
        __asm__ __volatile__(
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2: \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < (4096-320)/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        /*
         * The last 320 bytes of the page are copied without prefetch so
         * we never prefetch past the end of the source page:
         */
        for (i = (4096-320)/64; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        /*
         * Since movntq is weakly-ordered, an "sfence" is needed to order
         * the stores again:
         */
        __asm__ __volatile__("sfence \n"::);
        kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/128; i++) {
                __asm__ __volatile__ (
                "  movq %%mm0, (%0)\n"
                "  movq %%mm0, 8(%0)\n"
                "  movq %%mm0, 16(%0)\n"
                "  movq %%mm0, 24(%0)\n"
                "  movq %%mm0, 32(%0)\n"
                "  movq %%mm0, 40(%0)\n"
                "  movq %%mm0, 48(%0)\n"
                "  movq %%mm0, 56(%0)\n"
                "  movq %%mm0, 64(%0)\n"
                "  movq %%mm0, 72(%0)\n"
                "  movq %%mm0, 80(%0)\n"
                "  movq %%mm0, 88(%0)\n"
                "  movq %%mm0, 96(%0)\n"
                "  movq %%mm0, 104(%0)\n"
                "  movq %%mm0, 112(%0)\n"
                "  movq %%mm0, 120(%0)\n"
                : : "r" (page) : "memory");
                page += 128;
        }

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2: \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
        int d0, d1;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; stosl"

                : "=&c" (d0), "=&D" (d1)
                : "a" (0), "1" (page), "0" (1024) /* 1024 dwords == 4096 bytes */
                : "memory");
}

void mmx_clear_page(void *page)
{
        if (unlikely(in_interrupt()))
                slow_zero_page(page);
        else
                fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
        int d0, d1, d2;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (1024), "1" ((long) to), "2" ((long) from) /* 1024 dwords == 4096 bytes */
                : "memory");
}

void mmx_copy_page(void *to, void *from)
{
        if (unlikely(in_interrupt()))
                slow_copy_page(to, from);
        else
                fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
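
/*
 * Illustrative sketch (not part of the original file): how clear_page()/
 * copy_page() style helpers can be routed to the MMX versions above.  The
 * names example_clear_page()/example_copy_page() are made up for this
 * example; kernels built with CONFIG_X86_USE_3DNOW do the equivalent
 * wiring in the x86-32 page header.
 */
static inline void example_clear_page(void *page)
{
        mmx_clear_page(page);           /* falls back to rep;stosl in interrupts */
}

static inline void example_copy_page(void *to, void *from)
{
        mmx_copy_page(to, from);        /* falls back to rep;movsl in interrupts */
}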