summaryrefslogtreecommitdiffstats
path: root/arch/x86/lib/copy_page_64.S
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2007-10-12 21:27:47 -0400
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2007-10-12 21:27:47 -0400
commitb981d8b3f5e008ff10d993be633ad00564fc22cd (patch)
treee292dc07b22308912cf6a58354a608b9e5e8e1fd /arch/x86/lib/copy_page_64.S
parentb11d2127c4893a7315d1e16273bc8560049fa3ca (diff)
parent2b9e0aae1d50e880c58d46788e5e3ebd89d75d62 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: drivers/macintosh/adbhid.c
Diffstat (limited to 'arch/x86/lib/copy_page_64.S')
-rw-r--r--arch/x86/lib/copy_page_64.S119
1 files changed, 119 insertions, 0 deletions
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 00000000000..727a5d46d2f
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+ ALIGN
+copy_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ rep movsq
+ ret
+ CFI_ENDPROC
+ENDPROC(copy_page_c)
+
+/* Don't use streaming store because it's better when the target
+ ends up in cache. */
+
+/* Could vary the prefetch distance based on SMP/UP */
+
+ENTRY(copy_page)
+ CFI_STARTPROC
+ subq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 3*8
+ movq %rbx,(%rsp)
+ CFI_REL_OFFSET rbx, 0
+ movq %r12,1*8(%rsp)
+ CFI_REL_OFFSET r12, 1*8
+ movq %r13,2*8(%rsp)
+ CFI_REL_OFFSET r13, 2*8
+
+ movl $(4096/64)-5,%ecx
+ .p2align 4
+.Loop64:
+ dec %rcx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ prefetcht0 5*64(%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
+
+ jnz .Loop64
+
+ movl $5,%ecx
+ .p2align 4
+.Loop2:
+ decl %ecx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+
+ jnz .Loop2
+
+ movq (%rsp),%rbx
+ CFI_RESTORE rbx
+ movq 1*8(%rsp),%r12
+ CFI_RESTORE r12
+ movq 2*8(%rsp),%r13
+ CFI_RESTORE r13
+ addq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -3*8
+ ret
+.Lcopy_page_end:
+ CFI_ENDPROC
+ENDPROC(copy_page)
+
+ /* Some CPUs run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+2:
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_page
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lcopy_page_end - copy_page
+ .byte 2b - 1b
+ .previous