summaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib/memset.S
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/lib/memset.S
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/x86_64/lib/memset.S')
-rw-r--r--arch/x86_64/lib/memset.S125
1 files changed, 125 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 00000000000..4b4c4063864
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .globl __memset
+ .globl memset
+ .p2align 4
+memset:
+__memset:
+ movq %rdi,%r10
+ movq %rdx,%r11
+
+ /* expand byte value */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ mul %rcx /* with rax, clobbers rdx */
+
+ /* align dst */
+ movl %edi,%r9d
+ andl $7,%r9d
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+ movl %r11d,%ecx
+ shrl $6,%ecx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ movq %rax,32(%rdi)
+ movq %rax,40(%rdi)
+ movq %rax,48(%rdi)
+ movq %rax,56(%rdi)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+ /* Handle tail in loops. The loops should be faster than hard
+ to predict jump tables. */
+ .p2align 4
+.Lhandle_tail:
+ movl %r11d,%ecx
+ andl $63&(~7),%ecx
+ jz .Lhandle_7
+ shrl $3,%ecx
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %r11d,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ decl %ecx
+ movb %al,(%rdi)
+ leaq 1(%rdi),%rdi
+ jnz .Lloop_1
+
+.Lende:
+ movq %r10,%rax
+ ret
+
+.Lbad_alignment:
+ cmpq $7,%r11
+ jbe .Lhandle_7
+ movq %rax,(%rdi) /* unaligned store */
+ movq $8,%r8
+ subq %r9,%r8
+ addq %r8,%rdi
+ subq %r8,%r11
+ jmp .Lafter_bad_alignment
+
+ /* C stepping K8 run faster using the string instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad memset
+ .quad memset_c
+ .byte X86_FEATURE_K8_C
+ .byte memset_c_end-memset_c
+ .byte memset_c_end-memset_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+ /* rdi destination
+ * rsi value
+ * rdx count
+ */
+memset_c:
+ movq %rdi,%r9
+ movl %edx,%r8d
+ andl $7,%r8d
+ movl %edx,%ecx
+ shrl $3,%ecx
+ /* expand byte value */
+ movzbl %sil,%esi
+ movabs $0x0101010101010101,%rax
+ mulq %rsi /* with rax, clobbers rdx */
+ rep
+ stosq
+ movl %r8d,%ecx
+ rep
+ stosb
+ movq %r9,%rax
+ ret
+memset_c_end:
+ .previous