From 17968fbbd19f1bb281ee4eb2548764ac5664c4ec Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 27 May 2012 19:54:03 +0000 Subject: powerpc: 64bit optimised __clear_user I noticed __clear_user high up in a profile of one of my RAID stress tests. The testcase was doing a dd from /dev/zero which ends up calling __clear_user. __clear_user is basically a loop with a single 4 byte store which is horribly slow. We can do much better by aligning the desination and doing 32 bytes of 8 byte stores in a loop. The following testcase was used to verify the patch: http://ozlabs.org/~anton/junkcode/stress_clear_user.c To show the improvement in performance I ran a dd from /dev/zero to /dev/null on a POWER7 box: Before: # dd if=/dev/zero of=/dev/null bs=1M count=10000 10485760000 bytes (10 GB) copied, 3.72379 s, 2.8 GB/s After: # time dd if=/dev/zero of=/dev/null bs=1M count=10000 10485760000 bytes (10 GB) copied, 0.728318 s, 14.4 GB/s Over 5x faster. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/lib/string_64.S | 141 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 arch/powerpc/lib/string_64.S (limited to 'arch/powerpc/lib/string_64.S') diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 00000000000..6613b904700 --- /dev/null +++ b/arch/powerpc/lib/string_64.S @@ -0,0 +1,141 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ + +#include + +/** + * __clear_user: - Zero a block of memory in user space, with less checking. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + +.Ldo_err1: + mr r3,r8 + +.Ldo_err2: + mtctr r4 +1: +err3; stb r0,0(r3) + addi r3,r3,1 + addi r4,r4,-1 + bdnz 1b + +.Ldo_err3: + mr r3,r4 + blr + +_GLOBAL(__clear_user) + cmpdi r4,32 + neg r6,r3 + li r0,0 + blt .Lshort_clear + mr r8,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + /* Get the destination 8 byte aligned */ + bf cr7*4+3,1f +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r4,r4,r6 + srdi r6,r4,5 + cmpdi r4,32 + blt .Lshort_clear + mtctr r6 + + /* Do 32 byte chunks */ +4: +err2; std r0,0(r3) +err2; std r0,8(r3) +err2; std r0,16(r3) +err2; std r0,24(r3) + addi r3,r3,32 + addi r4,r4,-32 + bdnz 4b + +.Lshort_clear: + /* up to 31 bytes to go */ + cmpdi r4,16 + blt 6f +err2; std r0,0(r3) +err2; std r0,8(r3) + addi r3,r3,16 + addi r4,r4,-16 + + /* Up to 15 bytes to go */ +6: mr r8,r3 + clrldi r4,r4,(64-4) + mtocrf 0x01,r4 + bf cr7*4+0,7f +err1; std r0,0(r3) + addi r3,r3,8 + +7: bf cr7*4+1,8f +err1; stw r0,0(r3) + addi r3,r3,4 + +8: bf cr7*4+2,9f +err1; sth r0,0(r3) + addi r3,r3,2 + +9: bf cr7*4+3,10f +err1; stb r0,0(r3) + +10: li r3,0 + blr -- cgit v1.2.3-70-g09d2 From cf8fb5533f35709ba7e31560264b565a9c7a090f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 4 Jun 2012 16:02:22 +0000 Subject: powerpc: Optimise the 64bit optimised __clear_user I blame Mikey for this. He elevated my slightly dubious testcase: to benchmark status. And naturally we need to be number 1 at creating zeros. So lets improve __clear_user some more. As Paul suggests we can use dcbz for large lengths. This patch gets the destination cacheline aligned then uses dcbz on whole cachelines. Before: 10485760000 bytes (10 GB) copied, 0.414744 s, 25.3 GB/s After: 10485760000 bytes (10 GB) copied, 0.268597 s, 39.0 GB/s 39 GB/s, a new record. Signed-off-by: Anton Blanchard Tested-by: Olof Johansson Acked-by: Olof Johansson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/lib/string_64.S | 63 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/lib/string_64.S') diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S index 6613b904700..3b1e48049fa 100644 --- a/arch/powerpc/lib/string_64.S +++ b/arch/powerpc/lib/string_64.S @@ -19,6 +19,12 @@ */ #include +#include + + .section ".toc","aw" +PPC64_CACHES: + .tc ppc64_caches[TC],ppc64_caches + .section ".text" /** * __clear_user: - Zero a block of memory in user space, with less checking. @@ -94,9 +100,14 @@ err1; stw r0,0(r3) addi r3,r3,4 3: sub r4,r4,r6 - srdi r6,r4,5 + cmpdi r4,32 + cmpdi cr1,r4,512 blt .Lshort_clear + bgt cr1,.Llong_clear + +.Lmedium_clear: + srdi r6,r4,5 mtctr r6 /* Do 32 byte chunks */ @@ -139,3 +150,53 @@ err1; stb r0,0(r3) 10: li r3,0 blr + +.Llong_clear: + ld r5,PPC64_CACHES@toc(r2) + + bf cr7*4+0,11f +err2; std r0,0(r3) + addi r3,r3,8 + addi r4,r4,-8 + + /* Destination is 16 byte aligned, need to get it cacheline aligned */ +11: lwz r7,DCACHEL1LOGLINESIZE(r5) + lwz r9,DCACHEL1LINESIZE(r5) + + /* + * With worst case alignment the long clear loop takes a minimum + * of 1 byte less than 2 cachelines. + */ + sldi r10,r9,2 + cmpd r4,r10 + blt .Lmedium_clear + + neg r6,r3 + addi r10,r9,-1 + and. r5,r6,r10 + beq 13f + + srdi r6,r5,4 + mtctr r6 + mr r8,r3 +12: +err1; std r0,0(r3) +err1; std r0,8(r3) + addi r3,r3,16 + bdnz 12b + + sub r4,r4,r5 + +13: srd r6,r4,r7 + mtctr r6 + mr r8,r3 +14: +err1; dcbz r0,r3 + add r3,r3,r9 + bdnz 14b + + and r4,r4,r10 + + cmpdi r4,32 + blt .Lshort_clear + b .Lmedium_clear -- cgit v1.2.3-70-g09d2