Linux Kernel 3.7.1
xor_64.h
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache-avoiding checksumming functions utilizing KNI (SSE) instructions.
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on:
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer (AMD K8) yet, but there are
 * likely no advantages to be gained from x86-64 here anyway.
 */

#include <asm/i387.h>

/*
 * The XOR loops below work on 16-byte chunks held in XMM registers.
 * OFFS(x) is the byte offset of chunk x within the current 256-byte
 * stretch; PF_OFFS(x) is the same offset one loop iteration (256 bytes)
 * ahead, used for the non-temporal prefetches.
 */
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
/* PFn(x): non-temporal prefetch of chunk x of stream n+1, one iteration ahead */
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
/* LD/ST(x, y): move 16-byte chunk x of p1 into/out of register %xmm<y> */
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
/* XOn(x, y): XOR chunk x of stream n+1 into register %xmm<y> */
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
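
/*
 * For illustration (not in the original source), after macro expansion
 * LD(0, 0) becomes the string
 *     " movaps 16*(0)(%[p1]), %%xmm0 ;\n"
 * i.e. an aligned 16-byte load of chunk 0 of p1 into %xmm0, and PF0(4)
 * becomes
 *     " prefetchnta 256+16*(4)(%[p1]) ;\n"
 * i.e. a non-temporal prefetch 256 bytes (one loop iteration) ahead.
 */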


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;        /* 256 bytes per iteration */

        kernel_fpu_begin();

        /* p1 ^= p2: each BLOCK handles 64 bytes, four BLOCKs per 256-byte line */
        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        LD(i, 0) \
        LD(i + 1, 1) \
        PF1(i) \
        PF1(i + 2) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        kernel_fpu_end();
}
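
/*
 * Illustration only (not in the original source, compiled out): a plain-C
 * sketch of what xor_sse_2() computes, assuming `bytes` is a non-zero
 * multiple of 256, as callers of these routines arrange.  The name
 * xor_scalar_2 is hypothetical.
 */
#if 0
static void xor_scalar_2(unsigned long bytes, unsigned long *p1,
                         unsigned long *p2)
{
        unsigned long i;

        /* p1 is both destination and first source: p1[i] ^= p2[i] */
        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];
}
#endif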

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();
        /* p1 ^= p2 ^ p3 */
        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i) \
        PF1(i + 2) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF2(i) \
        PF2(i + 2) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");
        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();

        /* p1 ^= p2 ^ p3 ^ p4 */
        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i) \
        PF1(i + 2) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF2(i) \
        PF2(i + 2) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        PF3(i) \
        PF3(i + 2) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        XO3(i, 0) \
        XO3(i + 1, 1) \
        XO3(i + 2, 2) \
        XO3(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " addq %[inc], %[p4] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;

        kernel_fpu_begin();

        /* p1 ^= p2 ^ p3 ^ p4 ^ p5 */
        asm volatile(
#undef BLOCK
#define BLOCK(i) \
        PF1(i) \
        PF1(i + 2) \
        LD(i, 0) \
        LD(i + 1, 1) \
        LD(i + 2, 2) \
        LD(i + 3, 3) \
        PF2(i) \
        PF2(i + 2) \
        XO1(i, 0) \
        XO1(i + 1, 1) \
        XO1(i + 2, 2) \
        XO1(i + 3, 3) \
        PF3(i) \
        PF3(i + 2) \
        XO2(i, 0) \
        XO2(i + 1, 1) \
        XO2(i + 2, 2) \
        XO2(i + 3, 3) \
        PF4(i) \
        PF4(i + 2) \
        PF0(i + 4) \
        PF0(i + 6) \
        XO3(i, 0) \
        XO3(i + 1, 1) \
        XO3(i + 2, 2) \
        XO3(i + 3, 3) \
        XO4(i, 0) \
        XO4(i + 1, 1) \
        XO4(i + 2, 2) \
        XO4(i + 3, 3) \
        ST(i, 0) \
        ST(i + 1, 1) \
        ST(i + 2, 2) \
        ST(i + 3, 3) \


        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addq %[inc], %[p1] ;\n"
        " addq %[inc], %[p2] ;\n"
        " addq %[inc], %[p3] ;\n"
        " addq %[inc], %[p4] ;\n"
        " addq %[inc], %[p5] ;\n"
        " decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        kernel_fpu_end();
}

static struct xor_block_template xor_block_sse = {
        .name = "generic_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};
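
/*
 * Illustration (not in the original source): the xor framework dispatches
 * through these hooks.  xor_blocks() in crypto/xor.c, for example, ends up
 * doing roughly
 *
 *     active_template->do_2(bytes, dest, srcs[0]);
 *
 * for a single source, using do_3..do_5 (repeatedly, for larger source
 * counts) as the number of streams grows.  The buffers must be 16-byte
 * aligned, since movaps faults on unaligned addresses.
 */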


/* Also try the AVX routines */
#include <asm/xor_avx.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
        AVX_XOR_SPEED; \
        xor_speed(&xor_block_sse); \
} while (0)
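
/*
 * Note (not in the original source): xor_speed() benchmarks a template at
 * boot and records the result in its .speed field.  calibrate_xor_blocks()
 * in crypto/xor.c runs XOR_TRY_TEMPLATES and would normally keep the
 * fastest template, were that choice not overridden by the
 * XOR_SELECT_TEMPLATE definition below.
 */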

/*
 * We force the use of the SSE xor block because it can write around the
 * L2 cache.  We may also be able to load into the L1 cache only, depending
 * on how the CPU deals with a load to a line that is being prefetched.
 * The benchmark winner (FASTEST) is deliberately ignored: AVX_SELECT()
 * picks the AVX template when AVX is usable and falls back to the SSE
 * one otherwise.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */