summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-05-18 20:59:27 +0200
committerIngo Molnar <mingo@elte.hu>2011-05-18 20:59:30 +0200
commit01ed58abec07633791f03684b937a7e22e00c9bb (patch)
tree7bb5b60c102aa08e404928ffcb3edf1e8404b5a2
parentaf2d03d4aaa847ef41a229dfee098a47908437c6 (diff)
parent26afb7c661080ae3f1f13ddf7f0c58c4f931c22b (diff)
Merge branch 'x86/mem' into perf/core
Merge reason: memcpy_64.S changes an assumption perf bench has, so merge this here so we can fix it. Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--arch/x86/include/asm/alternative-asm.h9
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/cpu/common.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c19
-rw-r--r--arch/x86/lib/clear_page_64.S33
-rw-r--r--arch/x86/lib/copy_user_64.S69
-rw-r--r--arch/x86/lib/memcpy_64.S45
-rw-r--r--arch/x86/lib/memmove_64.S29
-rw-r--r--arch/x86/lib/memset_64.S54
11 files changed, 219 insertions, 54 deletions
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cc..94d420b360d 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
.endm
#endif
+.macro altinstruction_entry orig alt feature orig_len alt_len
+ .align 8
+ .quad \orig
+ .quad \alt
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+.endm
+
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf2..7f2f7b12329 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -195,6 +195,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762..99f0ad753f3 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -42,7 +42,7 @@
* Returns 0 if the range is valid, nonzero otherwise.
*
* This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
*
* This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
*/
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 651454b0c81..1eeeafcb441 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite a previous scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
for (a = start; a < end; a++) {
u8 *instr = a->instr;
BUG_ON(a->replacementlen > a->instrlen);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a4..173f3a3fa1a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
- if (eax > 0)
- c->x86_capability[9] = ebx;
+ c->x86_capability[9] = ebx;
}
/* AMD-defined flags: level 0x80000001 */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859..fc73a34ba8c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
+ u64 misc_enable;
+
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- u64 misc_enable;
-
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* (model 2) with the same problem.
*/
if (c->x86 == 15) {
- u64 misc_enable;
-
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
}
}
#endif
+
+ /*
+ * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+ * clear the fast string and enhanced fast string CPU capabilities.
+ */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ printk(KERN_INFO "Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
+ }
+ }
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24..f2145cfa12a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
CFI_ENDPROC
ENDPROC(clear_page_c)
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
.Lclear_page_end:
ENDPROC(clear_page)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
+2: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
+3:
.previous
.section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lclear_page_end - clear_page
- .byte 2b - 1b
+ altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+ .Lclear_page_end-clear_page, 2b-1b
+ altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
+ .Lclear_page_end-clear_page,3b-2b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e48261519..024840266ba 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
- .macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt-1b /* offset */ /* or alternatively to alt */
+ .long \alt1-1b /* offset */ /* or alternatively to alt1 */
+3: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
+
.section .altinstructions,"a"
- .align 8
- .quad 0b
- .quad 2b
- .word \feature /* when feature is set */
- .byte 5
- .byte 5
+ altinstruction_entry 0b,2b,\feature1,5,5
+ altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
addq %rdx,%rcx
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_to_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
addq %rdx,%rcx
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_from_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_from_user)
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
.previous
CFI_ENDPROC
ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+ CFI_STARTPROC
+ andl %edx,%edx
+ jz 2f
+ movl %edx,%ecx
+1: rep
+ movsb
+2: xorl %eax,%eax
+ ret
+
+ .section .fixup,"ax"
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ .section __ex_table,"a"
+ .align 8
+ .quad 1b,12b
+ .previous
+ CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e3..daab21dae2d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
.Lmemcpy_e:
.previous
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ rep movsb
+ ret
+.Lmemcpy_e_e:
+ .previous
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
- .section .altinstructions, "a"
- .align 8
- .quad memcpy
- .quad .Lmemcpy_c
- .word X86_FEATURE_REP_GOOD
-
- /*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
- .byte .Lmemcpy_e - .Lmemcpy_c
- .byte .Lmemcpy_e - .Lmemcpy_c
+ .section .altinstructions, "a"
+ altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+ altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a..d0ec9c2936d 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#undef memmove
@@ -24,6 +25,7 @@
*/
ENTRY(memmove)
CFI_STARTPROC
+
/* Handle more 32bytes in loop */
mov %rdi, %rax
cmp $0x20, %rdx
@@ -31,8 +33,13 @@ ENTRY(memmove)
/* Decide forward/backward copy mode */
cmp %rdi, %rsi
- jb 2f
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+.Lmemmove_begin_forward:
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -78,6 +85,8 @@ ENTRY(memmove)
rep movsq
movq %r11, (%r10)
jmp 13f
+.Lmemmove_end_forward:
+
/*
* Handle data backward by movsq.
*/
@@ -194,4 +203,22 @@ ENTRY(memmove)
13:
retq
CFI_ENDPROC
+
+ .section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+ /* Forward moving data. */
+ movq %rdx, %rcx
+ rep movsb
+ retq
+.Lmemmove_end_forward_efs:
+ .previous
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad .Lmemmove_begin_forward
+ .quad .Lmemmove_begin_forward_efs
+ .word X86_FEATURE_ERMS
+ .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+ .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+ .previous
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d34426965..79bd454b78a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
/*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
@@ -31,6 +35,28 @@
.Lmemset_e:
.previous
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+ movq %rdi,%r9
+ movb %sil,%al
+ movl %edx,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+.Lmemset_e_e:
+ .previous
+
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
ENDPROC(memset)
ENDPROC(__memset)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+ /* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
.section .altinstructions,"a"
- .align 8
- .quad memset
- .quad .Lmemset_c
- .word X86_FEATURE_REP_GOOD
- .byte .Lfinal - memset
- .byte .Lmemset_e - .Lmemset_c
+ altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous