x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line

On CPUs with 64-byte last level cache lines, this yields roughly 10% better
performance, independent of CPU vendor or specific model (as far as I was
able to test).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Jan Beulich <JBeulich@suse.com> 2012-11-02 14:20:24 +0000
committer: Ingo Molnar <mingo@kernel.org> 2013-01-25 09:23:50 +0100
commit: f317820cb6ee3fb173319bf76e0e62437be78ad2 (patch)
tree: fc57358da4ba9f11a8d80e508d01e99c2c62c1f9 /arch/x86/include/asm/xor_32.h
parent: e8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (diff)
1 files changed, 11 insertions, 12 deletions
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87f3cc..ce05722e3c6 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched.  */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES				\
 do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
 	AVX_XOR_SPEED;					\
-	if (cpu_has_xmm)				\
+	if (cpu_has_xmm) {				\
 		xor_speed(&xor_block_pIII_sse);		\
-	if (cpu_has_mmx) {				\
+		xor_speed(&xor_block_sse_pf64);		\
+	} else if (cpu_has_mmx) {			\
 		xor_speed(&xor_block_pII_mmx);		\
 		xor_speed(&xor_block_p5_mmx);		\
+	} else {					\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
 	}						\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST)			\
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
author	Jan Beulich <JBeulich@suse.com>	2012-11-02 14:20:24 +0000
committer	Ingo Molnar <mingo@kernel.org>	2013-01-25 09:23:50 +0100
commit	f317820cb6ee3fb173319bf76e0e62437be78ad2 (patch)
tree	fc57358da4ba9f11a8d80e508d01e99c2c62c1f9 /arch/x86/include/asm/xor_32.h
parent	e8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (diff)