Go to the documentation of this file. 1 #ifndef _ASM_X86_XOR_32_H
2 #define _ASM_X86_XOR_32_H
/*
 * Helper macros that each emit one line of MMX inline-assembly text.
 * Arguments are pasted in with the preprocessor '#' (stringize)
 * operator and joined by string-literal concatenation, e.g.
 * LD(0, 1) expands to " movq 8*(0)(%1), %%mm1 ;\n".
 *
 * Operand convention of the xor_pII_mmx_* routines that use them:
 * %1 = p1 (destination/source 1), %2..%5 = p2..p5 (further sources);
 * x is the 8-byte block index within the current cache line, y the
 * mm register number.
 */
#define LD(x, y)  " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y)  " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
32 xor_pII_mmx_2(
unsigned long bytes,
unsigned long *
p1,
unsigned long *p2)
34 unsigned long lines = bytes >> 7;
75 xor_pII_mmx_3(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
78 unsigned long lines = bytes >> 7;
116 "+
r" (p1), "+
r" (p2), "+
r" (p3)
124 xor_pII_mmx_4(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
125 unsigned long *p3,
unsigned long *p4)
127 unsigned long lines = bytes >> 7;
170 "+
r" (p1), "+
r" (p2), "+
r" (p3), "+
r" (p4)
179 xor_pII_mmx_5(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
180 unsigned long *p3,
unsigned long *p4,
unsigned long *p5)
182 unsigned long lines = bytes >> 7;
192 asm(
"" :
"+r" (p4),
"+r" (p5));
238 "+
r" (p1), "+
r" (p2), "+
r" (p3)
245 asm("" : "=
r" (p4), "=
r" (p5));
259 xor_p5_mmx_2(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2)
261 unsigned long lines = bytes >> 6;
268 " movq (%1), %%mm0 ;\n"
269 " movq 8(%1), %%mm1 ;\n"
270 " pxor (%2), %%mm0 ;\n"
271 " movq 16(%1), %%mm2 ;\n"
272 " movq %%mm0, (%1) ;\n"
273 " pxor 8(%2), %%mm1 ;\n"
274 " movq 24(%1), %%mm3 ;\n"
275 " movq %%mm1, 8(%1) ;\n"
276 " pxor 16(%2), %%mm2 ;\n"
277 " movq 32(%1), %%mm4 ;\n"
278 " movq %%mm2, 16(%1) ;\n"
279 " pxor 24(%2), %%mm3 ;\n"
280 " movq 40(%1), %%mm5 ;\n"
281 " movq %%mm3, 24(%1) ;\n"
282 " pxor 32(%2), %%mm4 ;\n"
283 " movq 48(%1), %%mm6 ;\n"
284 " movq %%mm4, 32(%1) ;\n"
285 " pxor 40(%2), %%mm5 ;\n"
286 " movq 56(%1), %%mm7 ;\n"
287 " movq %%mm5, 40(%1) ;\n"
288 " pxor 48(%2), %%mm6 ;\n"
289 " pxor 56(%2), %%mm7 ;\n"
290 " movq %%mm6, 48(%1) ;\n"
291 " movq %%mm7, 56(%1) ;\n"
306 xor_p5_mmx_3(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
309 unsigned long lines = bytes >> 6;
314 " .align 32,0x90 ;\n"
316 " movq (%1), %%mm0 ;\n"
317 " movq 8(%1), %%mm1 ;\n"
318 " pxor (%2), %%mm0 ;\n"
319 " movq 16(%1), %%mm2 ;\n"
320 " pxor 8(%2), %%mm1 ;\n"
321 " pxor (%3), %%mm0 ;\n"
322 " pxor 16(%2), %%mm2 ;\n"
323 " movq %%mm0, (%1) ;\n"
324 " pxor 8(%3), %%mm1 ;\n"
325 " pxor 16(%3), %%mm2 ;\n"
326 " movq 24(%1), %%mm3 ;\n"
327 " movq %%mm1, 8(%1) ;\n"
328 " movq 32(%1), %%mm4 ;\n"
329 " movq 40(%1), %%mm5 ;\n"
330 " pxor 24(%2), %%mm3 ;\n"
331 " movq %%mm2, 16(%1) ;\n"
332 " pxor 32(%2), %%mm4 ;\n"
333 " pxor 24(%3), %%mm3 ;\n"
334 " pxor 40(%2), %%mm5 ;\n"
335 " movq %%mm3, 24(%1) ;\n"
336 " pxor 32(%3), %%mm4 ;\n"
337 " pxor 40(%3), %%mm5 ;\n"
338 " movq 48(%1), %%mm6 ;\n"
339 " movq %%mm4, 32(%1) ;\n"
340 " movq 56(%1), %%mm7 ;\n"
341 " pxor 48(%2), %%mm6 ;\n"
342 " movq %%mm5, 40(%1) ;\n"
343 " pxor 56(%2), %%mm7 ;\n"
344 " pxor 48(%3), %%mm6 ;\n"
345 " pxor 56(%3), %%mm7 ;\n"
346 " movq %%mm6, 48(%1) ;\n"
347 " movq %%mm7, 56(%1) ;\n"
355 "+r" (p1),
"+r" (p2),
"+r" (p3)
363 xor_p5_mmx_4(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
364 unsigned long *p3,
unsigned long *p4)
366 unsigned long lines = bytes >> 6;
371 " .align 32,0x90 ;\n"
373 " movq (%1), %%mm0 ;\n"
374 " movq 8(%1), %%mm1 ;\n"
375 " pxor (%2), %%mm0 ;\n"
376 " movq 16(%1), %%mm2 ;\n"
377 " pxor 8(%2), %%mm1 ;\n"
378 " pxor (%3), %%mm0 ;\n"
379 " pxor 16(%2), %%mm2 ;\n"
380 " pxor 8(%3), %%mm1 ;\n"
381 " pxor (%4), %%mm0 ;\n"
382 " movq 24(%1), %%mm3 ;\n"
383 " pxor 16(%3), %%mm2 ;\n"
384 " pxor 8(%4), %%mm1 ;\n"
385 " movq %%mm0, (%1) ;\n"
386 " movq 32(%1), %%mm4 ;\n"
387 " pxor 24(%2), %%mm3 ;\n"
388 " pxor 16(%4), %%mm2 ;\n"
389 " movq %%mm1, 8(%1) ;\n"
390 " movq 40(%1), %%mm5 ;\n"
391 " pxor 32(%2), %%mm4 ;\n"
392 " pxor 24(%3), %%mm3 ;\n"
393 " movq %%mm2, 16(%1) ;\n"
394 " pxor 40(%2), %%mm5 ;\n"
395 " pxor 32(%3), %%mm4 ;\n"
396 " pxor 24(%4), %%mm3 ;\n"
397 " movq %%mm3, 24(%1) ;\n"
398 " movq 56(%1), %%mm7 ;\n"
399 " movq 48(%1), %%mm6 ;\n"
400 " pxor 40(%3), %%mm5 ;\n"
401 " pxor 32(%4), %%mm4 ;\n"
402 " pxor 48(%2), %%mm6 ;\n"
403 " movq %%mm4, 32(%1) ;\n"
404 " pxor 56(%2), %%mm7 ;\n"
405 " pxor 40(%4), %%mm5 ;\n"
406 " pxor 48(%3), %%mm6 ;\n"
407 " pxor 56(%3), %%mm7 ;\n"
408 " movq %%mm5, 40(%1) ;\n"
409 " pxor 48(%4), %%mm6 ;\n"
410 " pxor 56(%4), %%mm7 ;\n"
411 " movq %%mm6, 48(%1) ;\n"
412 " movq %%mm7, 56(%1) ;\n"
421 "+r" (p1),
"+r" (p2),
"+r" (p3),
"+r" (p4)
429 xor_p5_mmx_5(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
430 unsigned long *p3,
unsigned long *p4,
unsigned long *p5)
432 unsigned long lines = bytes >> 6;
442 asm(
"" :
"+r" (p4),
"+r" (p5));
445 " .align 32,0x90 ;\n"
447 " movq (%1), %%mm0 ;\n"
448 " movq 8(%1), %%mm1 ;\n"
449 " pxor (%2), %%mm0 ;\n"
450 " pxor 8(%2), %%mm1 ;\n"
451 " movq 16(%1), %%mm2 ;\n"
452 " pxor (%3), %%mm0 ;\n"
453 " pxor 8(%3), %%mm1 ;\n"
454 " pxor 16(%2), %%mm2 ;\n"
455 " pxor (%4), %%mm0 ;\n"
456 " pxor 8(%4), %%mm1 ;\n"
457 " pxor 16(%3), %%mm2 ;\n"
458 " movq 24(%1), %%mm3 ;\n"
459 " pxor (%5), %%mm0 ;\n"
460 " pxor 8(%5), %%mm1 ;\n"
461 " movq %%mm0, (%1) ;\n"
462 " pxor 16(%4), %%mm2 ;\n"
463 " pxor 24(%2), %%mm3 ;\n"
464 " movq %%mm1, 8(%1) ;\n"
465 " pxor 16(%5), %%mm2 ;\n"
466 " pxor 24(%3), %%mm3 ;\n"
467 " movq 32(%1), %%mm4 ;\n"
468 " movq %%mm2, 16(%1) ;\n"
469 " pxor 24(%4), %%mm3 ;\n"
470 " pxor 32(%2), %%mm4 ;\n"
471 " movq 40(%1), %%mm5 ;\n"
472 " pxor 24(%5), %%mm3 ;\n"
473 " pxor 32(%3), %%mm4 ;\n"
474 " pxor 40(%2), %%mm5 ;\n"
475 " movq %%mm3, 24(%1) ;\n"
476 " pxor 32(%4), %%mm4 ;\n"
477 " pxor 40(%3), %%mm5 ;\n"
478 " movq 48(%1), %%mm6 ;\n"
479 " movq 56(%1), %%mm7 ;\n"
480 " pxor 32(%5), %%mm4 ;\n"
481 " pxor 40(%4), %%mm5 ;\n"
482 " pxor 48(%2), %%mm6 ;\n"
483 " pxor 56(%2), %%mm7 ;\n"
484 " movq %%mm4, 32(%1) ;\n"
485 " pxor 48(%3), %%mm6 ;\n"
486 " pxor 56(%3), %%mm7 ;\n"
487 " pxor 40(%5), %%mm5 ;\n"
488 " pxor 48(%4), %%mm6 ;\n"
489 " pxor 56(%4), %%mm7 ;\n"
490 " movq %%mm5, 40(%1) ;\n"
491 " pxor 48(%5), %%mm6 ;\n"
492 " pxor 56(%5), %%mm7 ;\n"
493 " movq %%mm6, 48(%1) ;\n"
494 " movq %%mm7, 56(%1) ;\n"
504 "+r" (p1),
"+r" (p2),
"+r" (p3)
511 asm(
"" :
"=r" (p4),
"=r" (p5));
518 .do_2 = xor_pII_mmx_2,
519 .do_3 = xor_pII_mmx_3,
520 .do_4 = xor_pII_mmx_4,
521 .do_5 = xor_pII_mmx_5,
526 .do_2 = xor_p5_mmx_2,
527 .do_3 = xor_p5_mmx_3,
528 .do_4 = xor_p5_mmx_4,
529 .do_5 = xor_p5_mmx_5,
/*
 * Helper macros for the SSE (xmm-register) XOR routines; each expands
 * to one line of inline-assembly text via stringizing ('#') and
 * string-literal concatenation.
 *
 *   OFFS(x)    - byte-offset text for 16-byte block x of the line
 *   PF_OFFS(x) - same offset plus a 256-byte prefetch distance
 *   PF0..PF5   - prefetchnta through operand %1..%6
 *   LD/ST      - movaps load/store of block x to/from %%xmmN (via %1)
 *   XO1..XO5   - xorps of block x from source operand %2..%6
 *
 * NOTE(review): PF5/XO5 reference operand %6, one past the p1..p5
 * pointers visible here -- confirm against the full file which
 * routine (if any) supplies a sixth operand.
 */
#define OFFS(x)     "16*("#x")"
#define PF_OFFS(x)  "256+16*("#x")"
#define PF0(x)      " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x, y)    " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x, y)    " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x)      " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x)      " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x)      " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x)      " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x)      " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x, y)   " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x, y)   " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x, y)   " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x, y)   " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x, y)   " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
555 xor_sse_2(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2)
557 unsigned long lines = bytes >> 8;
606 xor_sse_3(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
609 unsigned long lines = bytes >> 8;
657 "+
r" (p1), "+
r"(p2), "+
r"(p3)
665 xor_sse_4(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
666 unsigned long *p3,
unsigned long *p4)
668 unsigned long lines = bytes >> 8;
723 "+
r" (p1), "+
r" (p2), "+
r" (p3), "+
r" (p4)
731 xor_sse_5(
unsigned long bytes,
unsigned long *p1,
unsigned long *p2,
732 unsigned long *p3,
unsigned long *p4,
unsigned long *p5)
734 unsigned long lines = bytes >> 8;
744 asm(
"" :
"+r" (p4),
"+r" (p5));
804 "+
r" (p1), "+
r" (p2), "+
r" (p3)
811 asm("" : "=
r" (p4), "=
r" (p5));
830 #undef XOR_TRY_TEMPLATES
831 #define XOR_TRY_TEMPLATES \
833 xor_speed(&xor_block_8regs); \
834 xor_speed(&xor_block_8regs_p); \
835 xor_speed(&xor_block_32regs); \
836 xor_speed(&xor_block_32regs_p); \
839 xor_speed(&xor_block_pIII_sse); \
841 xor_speed(&xor_block_pII_mmx); \
842 xor_speed(&xor_block_p5_mmx); \
/*
 * Select the XOR template to use: when the CPU supports SSE
 * (cpu_has_xmm), force the pIII_sse implementation instead of the
 * benchmark winner passed in FASTEST; AVX_SELECT() may further
 * substitute the AVX template when available.  (AVX_SELECT and
 * cpu_has_xmm are defined elsewhere in the kernel headers.)
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)