Linux Kernel 3.7.1
xor_32.h
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

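/*
 * Each do_N routine below XORs N - 1 source buffers into the first buffer,
 * 64 bits at a time.  The unrolling differs per variant: the pII_mmx loops
 * handle 128 bytes per iteration (hence bytes >> 7), the p5_mmx loops 64
 * bytes (bytes >> 6) and the SSE loops 256 bytes (bytes >> 8).  A minimal
 * plain-C sketch of what the two-source case computes, assuming 'bytes' is
 * a multiple of the block size as the callers guarantee (illustrative
 * only, not used anywhere):
 */
#if 0
static void xor_2_reference(unsigned long bytes,
                            unsigned long *p1, unsigned long *p2)
{
        unsigned long i, words = bytes / sizeof(unsigned long);

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i];         /* same result, without MMX/SSE */
}
#endif
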
#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"

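/*
 * The "#x"/"#y" stringizing above pastes the macro arguments straight into
 * the assembly text: LD(0, 0), for instance, expands to
 * " movq 8*(0)(%1), %%mm0 ;\n".  The BLOCK() macros defined inside each
 * asm statement below simply concatenate such strings, so the unrolled
 * loop body is assembled entirely by the C preprocessor.
 */
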
#include <asm/i387.h>

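/*
 * <asm/i387.h> provides kernel_fpu_begin()/kernel_fpu_end(), which every
 * routine below uses to save the FPU/SIMD state and disable preemption
 * before touching the %mm and %xmm registers.
 */
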
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        XO1(i, 0) \
        ST(i, 0) \
        XO1(i + 1, 1) \
        ST(i + 1, 1) \
        XO1(i + 2, 2) \
        ST(i + 2, 2) \
        XO1(i + 3, 3) \
        ST(i + 3, 3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        XO2(i, 0) \
        ST(i, 0) \
        XO2(i + 1, 1) \
        ST(i + 1, 1) \
        XO2(i + 2, 2) \
        ST(i + 2, 2) \
        XO2(i + 3, 3) \
        ST(i + 3, 3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " addl $128, %3 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        XO3(i, 0) \
        ST(i, 0) \
        XO3(i + 1, 1) \
        ST(i + 1, 1) \
        XO3(i + 2, 2) \
        ST(i + 2, 2) \
        XO3(i + 3, 3) \
        ST(i + 3, 3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " addl $128, %3 ;\n"
        " addl $128, %4 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1. */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        XO3(i, 0) \
        XO3(i + 1, 1) \
        XO3(i + 2, 2) \
        XO3(i + 3, 3) \
        XO4(i, 0) \
        ST(i, 0) \
        XO4(i + 1, 1) \
        ST(i + 1, 1) \
        XO4(i + 2, 2) \
        ST(i + 2, 2) \
        XO4(i + 3, 3) \
        ST(i + 3, 3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " addl $128, %3 ;\n"
        " addl $128, %4 ;\n"
        " addl $128, %5 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value. */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " addl $64, %3 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor (%4), %%mm0 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " pxor 8(%4), %%mm1 ;\n"
        " movq %%mm0, (%1) ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " pxor 16(%4), %%mm2 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 24(%4), %%mm3 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " pxor 32(%4), %%mm4 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " pxor 40(%4), %%mm5 ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%4), %%mm6 ;\n"
        " pxor 56(%4), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " addl $64, %3 ;\n"
        " addl $64, %4 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1. */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
        " .align 32,0x90 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " pxor (%4), %%mm0 ;\n"
        " pxor 8(%4), %%mm1 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " pxor (%5), %%mm0 ;\n"
        " pxor 8(%5), %%mm1 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 16(%4), %%mm2 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " pxor 16(%5), %%mm2 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 24(%4), %%mm3 ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 24(%5), %%mm3 ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%4), %%mm4 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " pxor 32(%5), %%mm4 ;\n"
        " pxor 40(%4), %%mm5 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " pxor 40(%5), %%mm5 ;\n"
        " pxor 48(%4), %%mm6 ;\n"
        " pxor 56(%4), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%5), %%mm6 ;\n"
        " pxor 56(%5), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " addl $64, %3 ;\n"
        " addl $64, %4 ;\n"
        " addl $64, %5 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value. */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};

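/*
 * These templates plug into the generic XOR framework; real callers go
 * through xor_blocks() in crypto/xor.c, which benchmarks the candidates at
 * boot and keeps the fastest.  A direct call would look roughly like this
 * (illustrative sketch only, assuming two suitably aligned PAGE_SIZE
 * buffers):
 */
#if 0
static void xor_two_pages(unsigned long *dst, unsigned long *src)
{
        /* dst[i] ^= src[i] over PAGE_SIZE bytes, via the pII_mmx routine */
        xor_block_pII_mmx.do_2(PAGE_SIZE, dst, src);
}
#endif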

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

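/*
 * "KNI" (Katmai New Instructions) was the pre-release name for SSE.  The
 * routines below stream data through movaps/xorps and prefetch upcoming
 * cache lines with prefetchnta so that the bulk XOR pollutes the caches as
 * little as possible.  A rough user-space analogue of the same idea using
 * SSE intrinsics (illustrative sketch only, not kernel code; assumes
 * 16-byte-aligned buffers and a float count that is a multiple of 4):
 */
#if 0
#include <xmmintrin.h>

static void xor_stream_sketch(float *dst, const float *src, unsigned long n)
{
        unsigned long i;

        for (i = 0; i < n; i += 4) {
                /* prefetch well ahead with a non-temporal hint */
                _mm_prefetch((const char *)(src + i) + 256, _MM_HINT_NTA);
                _mm_store_ps(dst + i,
                             _mm_xor_ps(_mm_load_ps(dst + i),
                                        _mm_load_ps(src + i)));
        }
}
#endif
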
#define OFFS(x)    "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x)     " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x, y)   " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x, y)   " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x)     " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x)     " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x)     " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x)     " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x)     " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x, y)  " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x, y)  " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x, y)  " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x, y)  " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x, y)  " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0) \
        LD(i + 1, 1) \
        PF1(i) \
        PF1(i + 2) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i) \
        PF1(i + 2) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF2(i) \
        PF2(i + 2) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " addl $256, %3 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i) \
        PF1(i + 2) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF2(i) \
        PF2(i + 2) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        PF3(i) \
        PF3(i + 2) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        XO3(i, 0) \
        XO3(i + 1, 1) \
        XO3(i + 2, 2) \
        XO3(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " addl $256, %3 ;\n"
        " addl $256, %4 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1. */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i) \
        PF1(i + 2) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF2(i) \
        PF2(i + 2) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        PF3(i) \
        PF3(i + 2) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        PF4(i) \
        PF4(i + 2) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO3(i, 0) \
        XO3(i + 1, 1) \
        XO3(i + 2, 2) \
        XO3(i + 3, 3) \
        XO4(i, 0) \
        XO4(i + 1, 1) \
        XO4(i + 2, 2) \
        XO4(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " addl $256, %3 ;\n"
        " addl $256, %4 ;\n"
        " addl $256, %5 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value. */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines. */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
do {                                                    \
        xor_speed(&xor_block_8regs);                    \
        xor_speed(&xor_block_8regs_p);                  \
        xor_speed(&xor_block_32regs);                   \
        xor_speed(&xor_block_32regs_p);                 \
        AVX_XOR_SPEED;                                  \
        if (cpu_has_xmm)                                \
                xor_speed(&xor_block_pIII_sse);         \
        if (cpu_has_mmx) {                              \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
        }                                               \
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

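/*
 * Rough shape of the boot-time selection (illustrative sketch only; the real
 * logic lives in calibrate_xor_blocks() in crypto/xor.c, benchmark_template()
 * is a hypothetical stand-in for the timed speed test, and the integer
 * 'speed' member of struct xor_block_template is assumed here).  xor_speed()
 * records a speed for each candidate, and XOR_SELECT_TEMPLATE() lets the
 * architecture override the winner; on x86-32 that means preferring the
 * cache-avoiding SSE block whenever SSE is present.
 */
#if 0
static struct xor_block_template *fastest;

#define xor_speed(templ)                                        \
        do {                                                    \
                (templ)->speed = benchmark_template(templ);     \
                if (!fastest || (templ)->speed > fastest->speed)\
                        fastest = (templ);                      \
        } while (0)

static void pick_xor_template(void)
{
        XOR_TRY_TEMPLATES;                      /* benchmark every candidate */
        fastest = XOR_SELECT_TEMPLATE(fastest); /* arch override: prefer SSE */
}
#endif
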
#endif /* _ASM_X86_XOR_32_H */