diff options
Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/clear_page_64.S | 33 | ||||
-rw-r--r-- | arch/x86/lib/copy_user_64.S | 69 | ||||
-rw-r--r-- | arch/x86/lib/memcpy_64.S | 45 | ||||
-rw-r--r-- | arch/x86/lib/memmove_64.S | 29 | ||||
-rw-r--r-- | arch/x86/lib/memset_64.S | 54 |
5 files changed, 183 insertions, 47 deletions
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index aa4326bfb24..f2145cfa12a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,6 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * Zero a page. @@ -14,6 +15,15 @@ ENTRY(clear_page_c) CFI_ENDPROC ENDPROC(clear_page_c) +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) + ENTRY(clear_page) CFI_STARTPROC xorl %eax,%eax @@ -38,21 +48,26 @@ ENTRY(clear_page) .Lclear_page_end: ENDPROC(clear_page) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs support enhanced REP MOVSB/STOSB instructions. + * It is recommended to use this when possible. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original function. + * + */ #include <asm/cpufeature.h> .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: +2: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ +3: .previous .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .word X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b + altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ + .Lclear_page_end-clear_page, 2b-1b + altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ + .Lclear_page_end-clear_page,3b-2b .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 99e48261519..024840266ba 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -15,23 +15,30 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeature.h> +#include <asm/alternative-asm.h> - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. + */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .word \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm @@ -72,8 +79,10 @@ ENTRY(_copy_to_user) addq %rdx,%rcx jc bad_to_user cmpq TI_addr_limit(%rax),%rcx - jae bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ja bad_to_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_to_user) @@ -85,8 +94,10 @@ ENTRY(_copy_from_user) addq %rdx,%rcx jc bad_from_user cmpq TI_addr_limit(%rax),%rcx - jae bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ja bad_from_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_from_user) @@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string) .previous CFI_ENDPROC ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + andl %edx,%edx + jz 2f + movl %edx,%ecx +1: rep + movsb +2: xorl %eax,%eax + ret + + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + .section __ex_table,"a" + .align 8 + .quad 1b,12b + .previous + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e35e3..daab21dae2d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,6 +4,7 @@ #include <asm/cpufeature.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * memcpy - Copy a memory block. @@ -37,6 +38,23 @@ .Lmemcpy_e: .previous +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + + movl %edx, %ecx + rep movsb + ret +.Lmemcpy_e_e: + .previous + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -171,21 +189,22 @@ ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad .Lmemcpy_c - .word X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte .Lmemcpy_e - .Lmemcpy_c - .byte .Lmemcpy_e - .Lmemcpy_c + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0ecb8433e5a..d0ec9c2936d 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,6 +8,7 @@ #define _STRING_C #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #undef memmove @@ -24,6 +25,7 @@ */ ENTRY(memmove) CFI_STARTPROC + /* Handle more 32bytes in loop */ mov %rdi, %rax cmp $0x20, %rdx @@ -31,8 +33,13 @@ ENTRY(memmove) /* Decide forward/backward copy mode */ cmp %rdi, %rsi - jb 2f + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f +.Lmemmove_begin_forward: /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -78,6 +85,8 @@ ENTRY(memmove) rep movsq movq %r11, (%r10) jmp 13f +.Lmemmove_end_forward: + /* * Handle data backward by movsq. */ @@ -194,4 +203,22 @@ ENTRY(memmove) 13: retq CFI_ENDPROC + + .section .altinstr_replacement,"ax" +.Lmemmove_begin_forward_efs: + /* Forward moving data. */ + movq %rdx, %rcx + rep movsb + retq +.Lmemmove_end_forward_efs: + .previous + + .section .altinstructions,"a" + .align 8 + .quad .Lmemmove_begin_forward + .quad .Lmemmove_begin_forward_efs + .word X86_FEATURE_ERMS + .byte .Lmemmove_end_forward-.Lmemmove_begin_forward + .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .previous ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 09d34426965..79bd454b78a 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -2,9 +2,13 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> /* - * ISO C memset - set a memory block to a byte value. + * ISO C memset - set a memory block to a byte value. This function uses fast + * string to get better performance than the original function. The code is + * simpler and shorter than the orignal function as well. * * rdi destination * rsi value (char) @@ -31,6 +35,28 @@ .Lmemset_e: .previous +/* + * ISO C memset - set a memory block to a byte value. This function uses + * enhanced rep stosb to override the fast string function. + * The code is simpler and shorter than the fast string function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c_e: + movq %rdi,%r9 + movb %sil,%al + movl %edx,%ecx + rep stosb + movq %r9,%rax + ret +.Lmemset_e_e: + .previous + ENTRY(memset) ENTRY(__memset) CFI_STARTPROC @@ -112,16 +138,20 @@ ENTRY(__memset) ENDPROC(memset) ENDPROC(__memset) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - + /* Some CPUs support enhanced REP MOVSB/STOSB feature. + * It is recommended to use this when possible. + * + * If enhanced REP MOVSB/STOSB feature is not available, use fast string + * instructions. + * + * Otherwise, use original memset function. + * + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + */ .section .altinstructions,"a" - .align 8 - .quad memset - .quad .Lmemset_c - .word X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte .Lmemset_e - .Lmemset_c + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e .previous |