From 7bcd3f34e262bbebffa954d80eab3a84f053da31 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Fri, 3 Feb 2006 21:51:02 +0100
Subject: [PATCH] x86_64: Undo the earlier changes to remove unrolled
 copy/memset functions

They cause quite bad performance regressions on Netburst
This is temporary until we can get new optimized functions
for these CPUs.

This undoes changes that were done in 2.6.15 and in 2.6.16-rc1,
essentially bringing the code back to 2.6.14 level. Only change
is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD
and fixed the check for the flag and also fixed some comments.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/lib/memcpy.S | 93 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 91 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/lib/memcpy.S')

diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 92dd8054460..5554948b555 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
  * 
  * Output:
  * rax original destination
- * 
- * TODO: check best memcpy for PSC
  */	
 
  	.globl __memcpy
@@ -20,6 +18,95 @@
 	.p2align 4
 __memcpy:
 memcpy:		
+	pushq %rbx
+	movq %rdi,%rax
+
+	movl %edx,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+
+	movq (%rsi),%r11
+	movq 8(%rsi),%r8
+
+	movq %r11,(%rdi)
+	movq %r8,1*8(%rdi)
+
+	movq 2*8(%rsi),%r9
+	movq 3*8(%rsi),%r10
+
+	movq %r9,2*8(%rdi)
+	movq %r10,3*8(%rdi)
+
+	movq 4*8(%rsi),%r11
+	movq 5*8(%rsi),%r8
+
+	movq %r11,4*8(%rdi)
+	movq %r8,5*8(%rdi)
+
+	movq 6*8(%rsi),%r9
+	movq 7*8(%rsi),%r10
+
+	movq %r9,6*8(%rdi)
+	movq %r10,7*8(%rdi)
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	jnz  .Lloop_64
+
+.Lhandle_tail:
+	movl %edx,%ecx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq (%rsi),%r8
+	movq %r8,(%rdi)
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz  .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	movb (%rsi),%r8b
+	movb %r8b,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+.Lende:
+	popq %rbx
+	ret
+.Lfinal:
+
+	/* Some CPUs run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad  memcpy
+	.quad  memcpy_c
+	.byte  X86_FEATURE_REP_GOOD
+	.byte  .Lfinal-memcpy
+	.byte  memcpy_c_end-memcpy_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi source
+  * rdx count
+  */
+memcpy_c:
 	movq %rdi,%rax
 	movl %edx,%ecx
 	shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
 	rep
 	movsb
 	ret
+memcpy_c_end:
+	.previous
-- 
cgit v1.2.3-70-g09d2