From 62d6b66edc68f906138df7ba01efd41a45981586 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 9 Nov 2007 14:06:24 +0900 Subject: sh: Move sh32 optimized I/O routines to arch/sh/lib/ Signed-off-by: Paul Mundt --- arch/sh/lib/Makefile | 2 +- arch/sh/lib/io.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 arch/sh/lib/io.c (limited to 'arch/sh/lib') diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile index 9dc7b698505..6f7ac9eeb54 100644 --- a/arch/sh/lib/Makefile +++ b/arch/sh/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for SuperH-specific library files.. # -lib-y = delay.o memset.o memmove.o memchr.o \ +lib-y = delay.o io.o memset.o memmove.o memchr.o \ checksum.o strlen.o div64.o div64-generic.o memcpy-y := memcpy.o diff --git a/arch/sh/lib/io.c b/arch/sh/lib/io.c new file mode 100644 index 00000000000..4f54ec43516 --- /dev/null +++ b/arch/sh/lib/io.c @@ -0,0 +1,82 @@ +/* + * arch/sh/lib/io.c - SH32 optimized I/O routines + * + * Copyright (C) 2000 Stuart Menefy + * Copyright (C) 2005 Paul Mundt + * + * Provide real functions which expand to whatever the header file defined. + * Also definitions of machine independent IO functions. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include +#include + +void __raw_readsl(unsigned long addr, void *datap, int len) +{ + u32 *data; + + for (data = datap; (len != 0) && (((u32)data & 0x1f) != 0); len--) + *data++ = ctrl_inl(addr); + + if (likely(len >= (0x20 >> 2))) { + int tmp2, tmp3, tmp4, tmp5, tmp6; + + __asm__ __volatile__( + "1: \n\t" + "mov.l @%7, r0 \n\t" + "mov.l @%7, %2 \n\t" +#ifdef CONFIG_CPU_SH4 + "movca.l r0, @%0 \n\t" +#else + "mov.l r0, @%0 \n\t" +#endif + "mov.l @%7, %3 \n\t" + "mov.l @%7, %4 \n\t" + "mov.l @%7, %5 \n\t" + "mov.l @%7, %6 \n\t" + "mov.l @%7, r7 \n\t" + "mov.l @%7, r0 \n\t" + "mov.l %2, @(0x04,%0) \n\t" + "mov #0x20>>2, %2 \n\t" + "mov.l %3, @(0x08,%0) \n\t" + "sub %2, %1 \n\t" + "mov.l %4, @(0x0c,%0) \n\t" + "cmp/hi %1, %2 ! T if 32 > len \n\t" + "mov.l %5, @(0x10,%0) \n\t" + "mov.l %6, @(0x14,%0) \n\t" + "mov.l r7, @(0x18,%0) \n\t" + "mov.l r0, @(0x1c,%0) \n\t" + "bf.s 1b \n\t" + " add #0x20, %0 \n\t" + : "=&r" (data), "=&r" (len), + "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4), + "=&r" (tmp5), "=&r" (tmp6) + : "r"(addr), "0" (data), "1" (len) + : "r0", "r7", "t", "memory"); + } + + for (; len != 0; len--) + *data++ = ctrl_inl(addr); +} +EXPORT_SYMBOL(__raw_readsl); + +void __raw_writesl(unsigned long addr, const void *data, int len) +{ + if (likely(len != 0)) { + int tmp1; + + __asm__ __volatile__ ( + "1: \n\t" + "mov.l @%0+, %1 \n\t" + "dt %3 \n\t" + "bf.s 1b \n\t" + " mov.l %1, @%4 \n\t" + : "=&r" (data), "=&r" (tmp1) + : "0" (data), "r" (len), "r"(addr) + : "t", "memory"); + } +} +EXPORT_SYMBOL(__raw_writesl); -- cgit v1.2.3-70-g09d2 From 9895f9429cb489ba271c06767531083ae4c4bcbe Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 21 Nov 2007 22:46:14 +0900 Subject: sh: clear/copy_page renames in lib and lib64. Signed-off-by: Paul Mundt --- arch/sh/lib/Makefile | 3 +- arch/sh/lib/clear_page.S | 152 ++++++++++++++++++ arch/sh/lib/copy_page.S | 388 +++++++++++++++++++++++++++++++++++++++++++++ arch/sh/lib64/Makefile | 2 +- arch/sh/lib64/clear_page.S | 54 +++++++ arch/sh/lib64/copy_page.S | 89 +++++++++++ arch/sh/lib64/page_clear.S | 54 ------- arch/sh/lib64/page_copy.S | 89 ----------- arch/sh/mm/Makefile_32 | 3 +- arch/sh/mm/clear_page.S | 152 ------------------ arch/sh/mm/copy_page.S | 388 --------------------------------------------- 11 files changed, 687 insertions(+), 687 deletions(-) create mode 100644 arch/sh/lib/clear_page.S create mode 100644 arch/sh/lib/copy_page.S create mode 100644 arch/sh/lib64/clear_page.S create mode 100644 arch/sh/lib64/copy_page.S delete mode 100644 arch/sh/lib64/page_clear.S delete mode 100644 arch/sh/lib64/page_copy.S delete mode 100644 arch/sh/mm/clear_page.S delete mode 100644 arch/sh/mm/copy_page.S (limited to 'arch/sh/lib') diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile index 6f7ac9eeb54..ebb55d1149f 100644 --- a/arch/sh/lib/Makefile +++ b/arch/sh/lib/Makefile @@ -8,6 +8,7 @@ lib-y = delay.o io.o memset.o memmove.o memchr.o \ memcpy-y := memcpy.o memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o -lib-y += $(memcpy-y) +lib-$(CONFIG_MMU) += copy_page.o clear_page.o +lib-y += $(memcpy-y) EXTRA_CFLAGS += -Werror diff --git a/arch/sh/lib/clear_page.S b/arch/sh/lib/clear_page.S new file mode 100644 index 00000000000..7a7c81ee3f0 --- /dev/null +++ b/arch/sh/lib/clear_page.S @@ -0,0 +1,152 @@ +/* + * __clear_user_page, __clear_user, clear_page implementation of SuperH + * + * Copyright (C) 2001 Kaz Kojima + * Copyright (C) 2001, 2002 Niibe Yutaka + * Copyright (C) 2006 Paul Mundt + */ +#include +#include + +/* + * clear_page_slow + * @to: P1 address + * + * void clear_page_slow(void *to) + */ + +/* + * r0 --- scratch + * r4 --- to + * r5 --- to + PAGE_SIZE + */ +ENTRY(clear_page_slow) + mov r4,r5 + mov.l .Llimit,r0 + add r0,r5 + mov #0,r0 + ! +1: +#if defined(CONFIG_CPU_SH3) + mov.l r0,@r4 +#elif defined(CONFIG_CPU_SH4) + movca.l r0,@r4 + mov r4,r1 +#endif + add #32,r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 +#if defined(CONFIG_CPU_SH4) + ocbwb @r1 +#endif + cmp/eq r5,r4 + bf/s 1b + add #28,r4 + ! + rts + nop +.Llimit: .long (PAGE_SIZE-28) + +ENTRY(__clear_user) + ! + mov #0, r0 + mov #0xe0, r1 ! 0xffffffe0 + ! + ! r4..(r4+31)&~32 -------- not aligned [ Area 0 ] + ! (r4+31)&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] + ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] + ! + ! Clear area 0 + mov r4, r2 + ! + tst r1, r5 ! length < 32 + bt .Larea2 ! skip to remainder + ! + add #31, r2 + and r1, r2 + cmp/eq r4, r2 + bt .Larea1 + mov r2, r3 + sub r4, r3 + mov r3, r7 + mov r4, r2 + ! +.L0: dt r3 +0: mov.b r0, @r2 + bf/s .L0 + add #1, r2 + ! + sub r7, r5 + mov r2, r4 +.Larea1: + mov r4, r3 + add r5, r3 + and r1, r3 + cmp/hi r2, r3 + bf .Larea2 + ! + ! Clear area 1 +#if defined(CONFIG_CPU_SH4) +1: movca.l r0, @r2 +#else +1: mov.l r0, @r2 +#endif + add #4, r2 +2: mov.l r0, @r2 + add #4, r2 +3: mov.l r0, @r2 + add #4, r2 +4: mov.l r0, @r2 + add #4, r2 +5: mov.l r0, @r2 + add #4, r2 +6: mov.l r0, @r2 + add #4, r2 +7: mov.l r0, @r2 + add #4, r2 +8: mov.l r0, @r2 + add #4, r2 + cmp/hi r2, r3 + bt/s 1b + nop + ! + ! Clear area 2 +.Larea2: + mov r4, r3 + add r5, r3 + cmp/hs r3, r2 + bt/s .Ldone + sub r2, r3 +.L2: dt r3 +9: mov.b r0, @r2 + bf/s .L2 + add #1, r2 + ! +.Ldone: rts + mov #0, r0 ! return 0 as normal return + + ! return the number of bytes remained +.Lbad_clear_user: + mov r4, r0 + add r5, r0 + rts + sub r2, r0 + +.section __ex_table,"a" + .align 2 + .long 0b, .Lbad_clear_user + .long 1b, .Lbad_clear_user + .long 2b, .Lbad_clear_user + .long 3b, .Lbad_clear_user + .long 4b, .Lbad_clear_user + .long 5b, .Lbad_clear_user + .long 6b, .Lbad_clear_user + .long 7b, .Lbad_clear_user + .long 8b, .Lbad_clear_user + .long 9b, .Lbad_clear_user +.previous diff --git a/arch/sh/lib/copy_page.S b/arch/sh/lib/copy_page.S new file mode 100644 index 00000000000..b879545fa28 --- /dev/null +++ b/arch/sh/lib/copy_page.S @@ -0,0 +1,388 @@ +/* + * copy_page, __copy_user_page, __copy_user implementation of SuperH + * + * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima + * Copyright (C) 2002 Toshinobu Sugioka + * Copyright (C) 2006 Paul Mundt + */ +#include +#include + +/* + * copy_page + * @to: P1 address + * @from: P1 address + * + * void copy_page(void *to, void *from) + */ + +/* + * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch + * r8 --- from + PAGE_SIZE + * r9 --- not used + * r10 --- to + * r11 --- from + */ +ENTRY(copy_page) + mov.l r8,@-r15 + mov.l r10,@-r15 + mov.l r11,@-r15 + mov r4,r10 + mov r5,r11 + mov r5,r8 + mov.l .Lpsz,r0 + add r0,r8 + ! +1: mov.l @r11+,r0 + mov.l @r11+,r1 + mov.l @r11+,r2 + mov.l @r11+,r3 + mov.l @r11+,r4 + mov.l @r11+,r5 + mov.l @r11+,r6 + mov.l @r11+,r7 +#if defined(CONFIG_CPU_SH3) + mov.l r0,@r10 +#elif defined(CONFIG_CPU_SH4) + movca.l r0,@r10 + mov r10,r0 +#endif + add #32,r10 + mov.l r7,@-r10 + mov.l r6,@-r10 + mov.l r5,@-r10 + mov.l r4,@-r10 + mov.l r3,@-r10 + mov.l r2,@-r10 + mov.l r1,@-r10 +#if defined(CONFIG_CPU_SH4) + ocbwb @r0 +#endif + cmp/eq r11,r8 + bf/s 1b + add #28,r10 + ! + mov.l @r15+,r11 + mov.l @r15+,r10 + mov.l @r15+,r8 + rts + nop + + .align 2 +.Lpsz: .long PAGE_SIZE +/* + * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n); + * Return the number of bytes NOT copied + */ +#define EX(...) \ + 9999: __VA_ARGS__ ; \ + .section __ex_table, "a"; \ + .long 9999b, 6000f ; \ + .previous +ENTRY(__copy_user) + ! Check if small number of bytes + mov #11,r0 + mov r4,r3 + cmp/gt r0,r6 ! r6 (len) > r0 (11) + bf/s .L_cleanup_loop_no_pop + add r6,r3 ! last destination address + + ! Calculate bytes needed to align to src + mov.l r11,@-r15 + neg r5,r0 + mov.l r10,@-r15 + add #4,r0 + mov.l r9,@-r15 + and #3,r0 + mov.l r8,@-r15 + tst r0,r0 + bt 2f + +1: + ! Copy bytes to long word align src +EX( mov.b @r5+,r1 ) + dt r0 + add #-1,r6 +EX( mov.b r1,@r4 ) + bf/s 1b + add #1,r4 + + ! Jump to appropriate routine depending on dest +2: mov #3,r1 + mov r6, r2 + and r4,r1 + shlr2 r2 + shll2 r1 + mova .L_jump_tbl,r0 + mov.l @(r0,r1),r1 + jmp @r1 + nop + + .align 2 +.L_jump_tbl: + .long .L_dest00 + .long .L_dest01 + .long .L_dest10 + .long .L_dest11 + +/* + * Come here if there are less than 12 bytes to copy + * + * Keep the branch target close, so the bf/s callee doesn't overflow + * and result in a more expensive branch being inserted. This is the + * fast-path for small copies, the jump via the jump table will hit the + * default slow-path cleanup. -PFM. + */ +.L_cleanup_loop_no_pop: + tst r6,r6 ! Check explicitly for zero + bt 1f + +2: +EX( mov.b @r5+,r0 ) + dt r6 +EX( mov.b r0,@r4 ) + bf/s 2b + add #1,r4 + +1: mov #0,r0 ! normal return +5000: + +# Exception handler: +.section .fixup, "ax" +6000: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + rts + nop + +! Destination = 00 + +.L_dest00: + ! Skip the large copy for small transfers + mov #(32+32-4), r0 + cmp/gt r6, r0 ! r0 (60) > r6 (len) + bt 1f + + ! Align dest to a 32 byte boundary + neg r4,r0 + add #0x20, r0 + and #0x1f, r0 + tst r0, r0 + bt 2f + + sub r0, r6 + shlr2 r0 +3: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 3b + add #4,r4 + +2: +EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r2 ) +EX( mov.l @r5+,r7 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r9 ) +EX( mov.l @r5+,r10 ) +EX( mov.l @r5+,r11 ) +#ifdef CONFIG_CPU_SH4 +EX( movca.l r0,@r4 ) +#else +EX( mov.l r0,@r4 ) +#endif + add #-32, r6 +EX( mov.l r1,@(4,r4) ) + mov #32, r0 +EX( mov.l r2,@(8,r4) ) + cmp/gt r6, r0 ! r0 (32) > r6 (len) +EX( mov.l r7,@(12,r4) ) +EX( mov.l r8,@(16,r4) ) +EX( mov.l r9,@(20,r4) ) +EX( mov.l r10,@(24,r4) ) +EX( mov.l r11,@(28,r4) ) + bf/s 2b + add #32,r4 + +1: mov r6, r0 + shlr2 r0 + tst r0, r0 + bt .L_cleanup +1: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 1b + add #4,r4 + + bra .L_cleanup + nop + +! Destination = 10 + +.L_dest10: + mov r2,r7 + shlr2 r7 + shlr r7 + tst r7,r7 + mov #7,r0 + bt/s 1f + and r0,r2 +2: + dt r7 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r9 ) +EX( mov.l @r5+,r10 ) +EX( mov.w r0,@r4 ) + add #2,r4 + xtrct r1,r0 + xtrct r8,r1 + xtrct r9,r8 + xtrct r10,r9 + +EX( mov.l r0,@r4 ) +EX( mov.l r1,@(4,r4) ) +EX( mov.l r8,@(8,r4) ) +EX( mov.l r9,@(12,r4) ) + +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r8 ) +EX( mov.l @r5+,r0 ) + xtrct r1,r10 + xtrct r8,r1 + xtrct r0,r8 + shlr16 r0 +EX( mov.l r10,@(16,r4) ) +EX( mov.l r1,@(20,r4) ) +EX( mov.l r8,@(24,r4) ) +EX( mov.w r0,@(28,r4) ) + bf/s 2b + add #30,r4 +#else +EX( mov.l @(28,r5),r0 ) +EX( mov.l @(24,r5),r8 ) +EX( mov.l @(20,r5),r9 ) +EX( mov.l @(16,r5),r10 ) +EX( mov.w r0,@(30,r4) ) + add #-2,r4 + xtrct r8,r0 + xtrct r9,r8 + xtrct r10,r9 +EX( mov.l r0,@(28,r4) ) +EX( mov.l r8,@(24,r4) ) +EX( mov.l r9,@(20,r4) ) + +EX( mov.l @(12,r5),r0 ) +EX( mov.l @(8,r5),r8 ) + xtrct r0,r10 +EX( mov.l @(4,r5),r9 ) + mov.l r10,@(16,r4) +EX( mov.l @r5,r10 ) + xtrct r8,r0 + xtrct r9,r8 + xtrct r10,r9 +EX( mov.l r0,@(12,r4) ) +EX( mov.l r8,@(8,r4) ) + swap.w r10,r0 +EX( mov.l r9,@(4,r4) ) +EX( mov.w r0,@(2,r4) ) + + add #32,r5 + bf/s 2b + add #34,r4 +#endif + tst r2,r2 + bt .L_cleanup + +1: ! Read longword, write two words per iteration +EX( mov.l @r5+,r0 ) + dt r2 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.w r0,@r4 ) + shlr16 r0 +EX( mov.w r0,@(2,r4) ) +#else +EX( mov.w r0,@(2,r4) ) + shlr16 r0 +EX( mov.w r0,@r4 ) +#endif + bf/s 1b + add #4,r4 + + bra .L_cleanup + nop + +! Destination = 01 or 11 + +.L_dest01: +.L_dest11: + ! Read longword, write byte, word, byte per iteration +EX( mov.l @r5+,r0 ) + dt r2 +#ifdef CONFIG_CPU_LITTLE_ENDIAN +EX( mov.b r0,@r4 ) + shlr8 r0 + add #1,r4 +EX( mov.w r0,@r4 ) + shlr16 r0 +EX( mov.b r0,@(2,r4) ) + bf/s .L_dest01 + add #3,r4 +#else +EX( mov.b r0,@(3,r4) ) + shlr8 r0 + swap.w r0,r7 +EX( mov.b r7,@r4 ) + add #1,r4 +EX( mov.w r0,@r4 ) + bf/s .L_dest01 + add #3,r4 +#endif + +! Cleanup last few bytes +.L_cleanup: + mov r6,r0 + and #3,r0 + tst r0,r0 + bt .L_exit + mov r0,r6 + +.L_cleanup_loop: +EX( mov.b @r5+,r0 ) + dt r6 +EX( mov.b r0,@r4 ) + bf/s .L_cleanup_loop + add #1,r4 + +.L_exit: + mov #0,r0 ! normal return + +5000: + +# Exception handler: +.section .fixup, "ax" +6000: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + mov.l @r15+,r8 + mov.l @r15+,r9 + mov.l @r15+,r10 + rts + mov.l @r15+,r11 diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile index 2f4086ac6f9..9950966923a 100644 --- a/arch/sh/lib64/Makefile +++ b/arch/sh/lib64/Makefile @@ -11,5 +11,5 @@ # Panic should really be compiled as PIC lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \ - page_copy.o page_clear.o + copy_page.o clear_page.o diff --git a/arch/sh/lib64/clear_page.S b/arch/sh/lib64/clear_page.S new file mode 100644 index 00000000000..007ab48ecc1 --- /dev/null +++ b/arch/sh/lib64/clear_page.S @@ -0,0 +1,54 @@ +/* + Copyright 2003 Richard Curnow, SuperH (UK) Ltd. + + This file is subject to the terms and conditions of the GNU General Public + License. See the file "COPYING" in the main directory of this archive + for more details. + + Tight version of memset for the case of just clearing a page. It turns out + that having the alloco's spaced out slightly due to the increment/branch + pair causes them to contend less for access to the cache. Similarly, + keeping the stores apart from the allocos causes less contention. => Do two + separate loops. Do multiple stores per loop to amortise the + increment/branch cost a little. + + Parameters: + r2 : source effective address (start of page) + + Always clears 4096 bytes. + + Note : alloco guarded by synco to avoid TAKum03020 erratum + +*/ + + .section .text..SHmedia32,"ax" + .little + + .balign 8 + .global clear_page +clear_page: + pta/l 1f, tr1 + pta/l 2f, tr2 + ptabs/l r18, tr0 + + movi 4096, r7 + add r2, r7, r7 + add r2, r63, r6 +1: + alloco r6, 0 + synco ! TAKum03020 + addi r6, 32, r6 + bgt/l r7, r6, tr1 + + add r2, r63, r6 +2: + st.q r6, 0, r63 + st.q r6, 8, r63 + st.q r6, 16, r63 + st.q r6, 24, r63 + addi r6, 32, r6 + bgt/l r7, r6, tr2 + + blink tr0, r63 + + diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S new file mode 100644 index 00000000000..0ec6fca63b5 --- /dev/null +++ b/arch/sh/lib64/copy_page.S @@ -0,0 +1,89 @@ +/* + Copyright 2003 Richard Curnow, SuperH (UK) Ltd. + + This file is subject to the terms and conditions of the GNU General Public + License. See the file "COPYING" in the main directory of this archive + for more details. + + Tight version of mempy for the case of just copying a page. + Prefetch strategy empirically optimised against RTL simulations + of SH5-101 cut2 eval chip with Cayman board DDR memory. + + Parameters: + r2 : destination effective address (start of page) + r3 : source effective address (start of page) + + Always copies 4096 bytes. + + Points to review. + * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. + It seems like the prefetch needs to be at at least 4 lines ahead to get + the data into the cache in time, and the allocos contend with outstanding + prefetches for the same cache set, so it's better to have the numbers + different. + */ + + .section .text..SHmedia32,"ax" + .little + + .balign 8 + .global copy_page +copy_page: + + /* Copy 4096 bytes worth of data from r3 to r2. + Do prefetches 4 lines ahead. + Do alloco 2 lines ahead */ + + pta 1f, tr1 + pta 2f, tr2 + pta 3f, tr3 + ptabs r18, tr0 + +#if 0 + /* TAKum03020 */ + ld.q r3, 0x00, r63 + ld.q r3, 0x20, r63 + ld.q r3, 0x40, r63 + ld.q r3, 0x60, r63 +#endif + alloco r2, 0x00 + synco ! TAKum03020 + alloco r2, 0x20 + synco ! TAKum03020 + + movi 3968, r6 + add r2, r6, r6 + addi r6, 64, r7 + addi r7, 64, r8 + sub r3, r2, r60 + addi r60, 8, r61 + addi r61, 8, r62 + addi r62, 8, r23 + addi r60, 0x80, r22 + +/* Minimal code size. The extra branches inside the loop don't cost much + because they overlap with the time spent waiting for prefetches to + complete. */ +1: +#if 0 + /* TAKum03020 */ + bge/u r2, r6, tr2 ! skip prefetch for last 4 lines + ldx.q r2, r22, r63 ! prefetch 4 lines hence +#endif +2: + bge/u r2, r7, tr3 ! skip alloco for last 2 lines + alloco r2, 0x40 ! alloc destination line 2 lines ahead + synco ! TAKum03020 +3: + ldx.q r2, r60, r36 + ldx.q r2, r61, r37 + ldx.q r2, r62, r38 + ldx.q r2, r23, r39 + st.q r2, 0, r36 + st.q r2, 8, r37 + st.q r2, 16, r38 + st.q r2, 24, r39 + addi r2, 32, r2 + bgt/l r8, r2, tr1 + + blink tr0, r63 ! return diff --git a/arch/sh/lib64/page_clear.S b/arch/sh/lib64/page_clear.S deleted file mode 100644 index 007ab48ecc1..00000000000 --- a/arch/sh/lib64/page_clear.S +++ /dev/null @@ -1,54 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of memset for the case of just clearing a page. It turns out - that having the alloco's spaced out slightly due to the increment/branch - pair causes them to contend less for access to the cache. Similarly, - keeping the stores apart from the allocos causes less contention. => Do two - separate loops. Do multiple stores per loop to amortise the - increment/branch cost a little. - - Parameters: - r2 : source effective address (start of page) - - Always clears 4096 bytes. - - Note : alloco guarded by synco to avoid TAKum03020 erratum - -*/ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global clear_page -clear_page: - pta/l 1f, tr1 - pta/l 2f, tr2 - ptabs/l r18, tr0 - - movi 4096, r7 - add r2, r7, r7 - add r2, r63, r6 -1: - alloco r6, 0 - synco ! TAKum03020 - addi r6, 32, r6 - bgt/l r7, r6, tr1 - - add r2, r63, r6 -2: - st.q r6, 0, r63 - st.q r6, 8, r63 - st.q r6, 16, r63 - st.q r6, 24, r63 - addi r6, 32, r6 - bgt/l r7, r6, tr2 - - blink tr0, r63 - - diff --git a/arch/sh/lib64/page_copy.S b/arch/sh/lib64/page_copy.S deleted file mode 100644 index 0ec6fca63b5..00000000000 --- a/arch/sh/lib64/page_copy.S +++ /dev/null @@ -1,89 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of mempy for the case of just copying a page. - Prefetch strategy empirically optimised against RTL simulations - of SH5-101 cut2 eval chip with Cayman board DDR memory. - - Parameters: - r2 : destination effective address (start of page) - r3 : source effective address (start of page) - - Always copies 4096 bytes. - - Points to review. - * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. - It seems like the prefetch needs to be at at least 4 lines ahead to get - the data into the cache in time, and the allocos contend with outstanding - prefetches for the same cache set, so it's better to have the numbers - different. - */ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global copy_page -copy_page: - - /* Copy 4096 bytes worth of data from r3 to r2. - Do prefetches 4 lines ahead. - Do alloco 2 lines ahead */ - - pta 1f, tr1 - pta 2f, tr2 - pta 3f, tr3 - ptabs r18, tr0 - -#if 0 - /* TAKum03020 */ - ld.q r3, 0x00, r63 - ld.q r3, 0x20, r63 - ld.q r3, 0x40, r63 - ld.q r3, 0x60, r63 -#endif - alloco r2, 0x00 - synco ! TAKum03020 - alloco r2, 0x20 - synco ! TAKum03020 - - movi 3968, r6 - add r2, r6, r6 - addi r6, 64, r7 - addi r7, 64, r8 - sub r3, r2, r60 - addi r60, 8, r61 - addi r61, 8, r62 - addi r62, 8, r23 - addi r60, 0x80, r22 - -/* Minimal code size. The extra branches inside the loop don't cost much - because they overlap with the time spent waiting for prefetches to - complete. */ -1: -#if 0 - /* TAKum03020 */ - bge/u r2, r6, tr2 ! skip prefetch for last 4 lines - ldx.q r2, r22, r63 ! prefetch 4 lines hence -#endif -2: - bge/u r2, r7, tr3 ! skip alloco for last 2 lines - alloco r2, 0x40 ! alloc destination line 2 lines ahead - synco ! TAKum03020 -3: - ldx.q r2, r60, r36 - ldx.q r2, r61, r37 - ldx.q r2, r62, r38 - ldx.q r2, r23, r39 - st.q r2, 0, r36 - st.q r2, 8, r37 - st.q r2, 16, r38 - st.q r2, 24, r39 - addi r2, 32, r2 - bgt/l r8, r2, tr1 - - blink tr0, r63 ! return diff --git a/arch/sh/mm/Makefile_32 b/arch/sh/mm/Makefile_32 index 095abd14592..e295db60b91 100644 --- a/arch/sh/mm/Makefile_32 +++ b/arch/sh/mm/Makefile_32 @@ -12,8 +12,7 @@ obj-$(CONFIG_SH7705_CACHE_32KB) += cache-sh7705.o endif mmu-y := tlb-nommu.o pg-nommu.o -mmu-$(CONFIG_MMU) := fault_32.o clear_page.o copy_page.o tlbflush_32.o \ - ioremap_32.o +mmu-$(CONFIG_MMU) := fault_32.o tlbflush_32.o ioremap_32.o obj-y += $(mmu-y) diff --git a/arch/sh/mm/clear_page.S b/arch/sh/mm/clear_page.S deleted file mode 100644 index 7a7c81ee3f0..00000000000 --- a/arch/sh/mm/clear_page.S +++ /dev/null @@ -1,152 +0,0 @@ -/* - * __clear_user_page, __clear_user, clear_page implementation of SuperH - * - * Copyright (C) 2001 Kaz Kojima - * Copyright (C) 2001, 2002 Niibe Yutaka - * Copyright (C) 2006 Paul Mundt - */ -#include -#include - -/* - * clear_page_slow - * @to: P1 address - * - * void clear_page_slow(void *to) - */ - -/* - * r0 --- scratch - * r4 --- to - * r5 --- to + PAGE_SIZE - */ -ENTRY(clear_page_slow) - mov r4,r5 - mov.l .Llimit,r0 - add r0,r5 - mov #0,r0 - ! -1: -#if defined(CONFIG_CPU_SH3) - mov.l r0,@r4 -#elif defined(CONFIG_CPU_SH4) - movca.l r0,@r4 - mov r4,r1 -#endif - add #32,r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 -#if defined(CONFIG_CPU_SH4) - ocbwb @r1 -#endif - cmp/eq r5,r4 - bf/s 1b - add #28,r4 - ! - rts - nop -.Llimit: .long (PAGE_SIZE-28) - -ENTRY(__clear_user) - ! - mov #0, r0 - mov #0xe0, r1 ! 0xffffffe0 - ! - ! r4..(r4+31)&~32 -------- not aligned [ Area 0 ] - ! (r4+31)&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] - ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] - ! - ! Clear area 0 - mov r4, r2 - ! - tst r1, r5 ! length < 32 - bt .Larea2 ! skip to remainder - ! - add #31, r2 - and r1, r2 - cmp/eq r4, r2 - bt .Larea1 - mov r2, r3 - sub r4, r3 - mov r3, r7 - mov r4, r2 - ! -.L0: dt r3 -0: mov.b r0, @r2 - bf/s .L0 - add #1, r2 - ! - sub r7, r5 - mov r2, r4 -.Larea1: - mov r4, r3 - add r5, r3 - and r1, r3 - cmp/hi r2, r3 - bf .Larea2 - ! - ! Clear area 1 -#if defined(CONFIG_CPU_SH4) -1: movca.l r0, @r2 -#else -1: mov.l r0, @r2 -#endif - add #4, r2 -2: mov.l r0, @r2 - add #4, r2 -3: mov.l r0, @r2 - add #4, r2 -4: mov.l r0, @r2 - add #4, r2 -5: mov.l r0, @r2 - add #4, r2 -6: mov.l r0, @r2 - add #4, r2 -7: mov.l r0, @r2 - add #4, r2 -8: mov.l r0, @r2 - add #4, r2 - cmp/hi r2, r3 - bt/s 1b - nop - ! - ! Clear area 2 -.Larea2: - mov r4, r3 - add r5, r3 - cmp/hs r3, r2 - bt/s .Ldone - sub r2, r3 -.L2: dt r3 -9: mov.b r0, @r2 - bf/s .L2 - add #1, r2 - ! -.Ldone: rts - mov #0, r0 ! return 0 as normal return - - ! return the number of bytes remained -.Lbad_clear_user: - mov r4, r0 - add r5, r0 - rts - sub r2, r0 - -.section __ex_table,"a" - .align 2 - .long 0b, .Lbad_clear_user - .long 1b, .Lbad_clear_user - .long 2b, .Lbad_clear_user - .long 3b, .Lbad_clear_user - .long 4b, .Lbad_clear_user - .long 5b, .Lbad_clear_user - .long 6b, .Lbad_clear_user - .long 7b, .Lbad_clear_user - .long 8b, .Lbad_clear_user - .long 9b, .Lbad_clear_user -.previous diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S deleted file mode 100644 index b879545fa28..00000000000 --- a/arch/sh/mm/copy_page.S +++ /dev/null @@ -1,388 +0,0 @@ -/* - * copy_page, __copy_user_page, __copy_user implementation of SuperH - * - * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima - * Copyright (C) 2002 Toshinobu Sugioka - * Copyright (C) 2006 Paul Mundt - */ -#include -#include - -/* - * copy_page - * @to: P1 address - * @from: P1 address - * - * void copy_page(void *to, void *from) - */ - -/* - * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch - * r8 --- from + PAGE_SIZE - * r9 --- not used - * r10 --- to - * r11 --- from - */ -ENTRY(copy_page) - mov.l r8,@-r15 - mov.l r10,@-r15 - mov.l r11,@-r15 - mov r4,r10 - mov r5,r11 - mov r5,r8 - mov.l .Lpsz,r0 - add r0,r8 - ! -1: mov.l @r11+,r0 - mov.l @r11+,r1 - mov.l @r11+,r2 - mov.l @r11+,r3 - mov.l @r11+,r4 - mov.l @r11+,r5 - mov.l @r11+,r6 - mov.l @r11+,r7 -#if defined(CONFIG_CPU_SH3) - mov.l r0,@r10 -#elif defined(CONFIG_CPU_SH4) - movca.l r0,@r10 - mov r10,r0 -#endif - add #32,r10 - mov.l r7,@-r10 - mov.l r6,@-r10 - mov.l r5,@-r10 - mov.l r4,@-r10 - mov.l r3,@-r10 - mov.l r2,@-r10 - mov.l r1,@-r10 -#if defined(CONFIG_CPU_SH4) - ocbwb @r0 -#endif - cmp/eq r11,r8 - bf/s 1b - add #28,r10 - ! - mov.l @r15+,r11 - mov.l @r15+,r10 - mov.l @r15+,r8 - rts - nop - - .align 2 -.Lpsz: .long PAGE_SIZE -/* - * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n); - * Return the number of bytes NOT copied - */ -#define EX(...) \ - 9999: __VA_ARGS__ ; \ - .section __ex_table, "a"; \ - .long 9999b, 6000f ; \ - .previous -ENTRY(__copy_user) - ! Check if small number of bytes - mov #11,r0 - mov r4,r3 - cmp/gt r0,r6 ! r6 (len) > r0 (11) - bf/s .L_cleanup_loop_no_pop - add r6,r3 ! last destination address - - ! Calculate bytes needed to align to src - mov.l r11,@-r15 - neg r5,r0 - mov.l r10,@-r15 - add #4,r0 - mov.l r9,@-r15 - and #3,r0 - mov.l r8,@-r15 - tst r0,r0 - bt 2f - -1: - ! Copy bytes to long word align src -EX( mov.b @r5+,r1 ) - dt r0 - add #-1,r6 -EX( mov.b r1,@r4 ) - bf/s 1b - add #1,r4 - - ! Jump to appropriate routine depending on dest -2: mov #3,r1 - mov r6, r2 - and r4,r1 - shlr2 r2 - shll2 r1 - mova .L_jump_tbl,r0 - mov.l @(r0,r1),r1 - jmp @r1 - nop - - .align 2 -.L_jump_tbl: - .long .L_dest00 - .long .L_dest01 - .long .L_dest10 - .long .L_dest11 - -/* - * Come here if there are less than 12 bytes to copy - * - * Keep the branch target close, so the bf/s callee doesn't overflow - * and result in a more expensive branch being inserted. This is the - * fast-path for small copies, the jump via the jump table will hit the - * default slow-path cleanup. -PFM. - */ -.L_cleanup_loop_no_pop: - tst r6,r6 ! Check explicitly for zero - bt 1f - -2: -EX( mov.b @r5+,r0 ) - dt r6 -EX( mov.b r0,@r4 ) - bf/s 2b - add #1,r4 - -1: mov #0,r0 ! normal return -5000: - -# Exception handler: -.section .fixup, "ax" -6000: - mov.l 8000f,r1 - mov r3,r0 - jmp @r1 - sub r4,r0 - .align 2 -8000: .long 5000b - -.previous - rts - nop - -! Destination = 00 - -.L_dest00: - ! Skip the large copy for small transfers - mov #(32+32-4), r0 - cmp/gt r6, r0 ! r0 (60) > r6 (len) - bt 1f - - ! Align dest to a 32 byte boundary - neg r4,r0 - add #0x20, r0 - and #0x1f, r0 - tst r0, r0 - bt 2f - - sub r0, r6 - shlr2 r0 -3: -EX( mov.l @r5+,r1 ) - dt r0 -EX( mov.l r1,@r4 ) - bf/s 3b - add #4,r4 - -2: -EX( mov.l @r5+,r0 ) -EX( mov.l @r5+,r1 ) -EX( mov.l @r5+,r2 ) -EX( mov.l @r5+,r7 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r9 ) -EX( mov.l @r5+,r10 ) -EX( mov.l @r5+,r11 ) -#ifdef CONFIG_CPU_SH4 -EX( movca.l r0,@r4 ) -#else -EX( mov.l r0,@r4 ) -#endif - add #-32, r6 -EX( mov.l r1,@(4,r4) ) - mov #32, r0 -EX( mov.l r2,@(8,r4) ) - cmp/gt r6, r0 ! r0 (32) > r6 (len) -EX( mov.l r7,@(12,r4) ) -EX( mov.l r8,@(16,r4) ) -EX( mov.l r9,@(20,r4) ) -EX( mov.l r10,@(24,r4) ) -EX( mov.l r11,@(28,r4) ) - bf/s 2b - add #32,r4 - -1: mov r6, r0 - shlr2 r0 - tst r0, r0 - bt .L_cleanup -1: -EX( mov.l @r5+,r1 ) - dt r0 -EX( mov.l r1,@r4 ) - bf/s 1b - add #4,r4 - - bra .L_cleanup - nop - -! Destination = 10 - -.L_dest10: - mov r2,r7 - shlr2 r7 - shlr r7 - tst r7,r7 - mov #7,r0 - bt/s 1f - and r0,r2 -2: - dt r7 -#ifdef CONFIG_CPU_LITTLE_ENDIAN -EX( mov.l @r5+,r0 ) -EX( mov.l @r5+,r1 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r9 ) -EX( mov.l @r5+,r10 ) -EX( mov.w r0,@r4 ) - add #2,r4 - xtrct r1,r0 - xtrct r8,r1 - xtrct r9,r8 - xtrct r10,r9 - -EX( mov.l r0,@r4 ) -EX( mov.l r1,@(4,r4) ) -EX( mov.l r8,@(8,r4) ) -EX( mov.l r9,@(12,r4) ) - -EX( mov.l @r5+,r1 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r0 ) - xtrct r1,r10 - xtrct r8,r1 - xtrct r0,r8 - shlr16 r0 -EX( mov.l r10,@(16,r4) ) -EX( mov.l r1,@(20,r4) ) -EX( mov.l r8,@(24,r4) ) -EX( mov.w r0,@(28,r4) ) - bf/s 2b - add #30,r4 -#else -EX( mov.l @(28,r5),r0 ) -EX( mov.l @(24,r5),r8 ) -EX( mov.l @(20,r5),r9 ) -EX( mov.l @(16,r5),r10 ) -EX( mov.w r0,@(30,r4) ) - add #-2,r4 - xtrct r8,r0 - xtrct r9,r8 - xtrct r10,r9 -EX( mov.l r0,@(28,r4) ) -EX( mov.l r8,@(24,r4) ) -EX( mov.l r9,@(20,r4) ) - -EX( mov.l @(12,r5),r0 ) -EX( mov.l @(8,r5),r8 ) - xtrct r0,r10 -EX( mov.l @(4,r5),r9 ) - mov.l r10,@(16,r4) -EX( mov.l @r5,r10 ) - xtrct r8,r0 - xtrct r9,r8 - xtrct r10,r9 -EX( mov.l r0,@(12,r4) ) -EX( mov.l r8,@(8,r4) ) - swap.w r10,r0 -EX( mov.l r9,@(4,r4) ) -EX( mov.w r0,@(2,r4) ) - - add #32,r5 - bf/s 2b - add #34,r4 -#endif - tst r2,r2 - bt .L_cleanup - -1: ! Read longword, write two words per iteration -EX( mov.l @r5+,r0 ) - dt r2 -#ifdef CONFIG_CPU_LITTLE_ENDIAN -EX( mov.w r0,@r4 ) - shlr16 r0 -EX( mov.w r0,@(2,r4) ) -#else -EX( mov.w r0,@(2,r4) ) - shlr16 r0 -EX( mov.w r0,@r4 ) -#endif - bf/s 1b - add #4,r4 - - bra .L_cleanup - nop - -! Destination = 01 or 11 - -.L_dest01: -.L_dest11: - ! Read longword, write byte, word, byte per iteration -EX( mov.l @r5+,r0 ) - dt r2 -#ifdef CONFIG_CPU_LITTLE_ENDIAN -EX( mov.b r0,@r4 ) - shlr8 r0 - add #1,r4 -EX( mov.w r0,@r4 ) - shlr16 r0 -EX( mov.b r0,@(2,r4) ) - bf/s .L_dest01 - add #3,r4 -#else -EX( mov.b r0,@(3,r4) ) - shlr8 r0 - swap.w r0,r7 -EX( mov.b r7,@r4 ) - add #1,r4 -EX( mov.w r0,@r4 ) - bf/s .L_dest01 - add #3,r4 -#endif - -! Cleanup last few bytes -.L_cleanup: - mov r6,r0 - and #3,r0 - tst r0,r0 - bt .L_exit - mov r0,r6 - -.L_cleanup_loop: -EX( mov.b @r5+,r0 ) - dt r6 -EX( mov.b r0,@r4 ) - bf/s .L_cleanup_loop - add #1,r4 - -.L_exit: - mov #0,r0 ! normal return - -5000: - -# Exception handler: -.section .fixup, "ax" -6000: - mov.l 8000f,r1 - mov r3,r0 - jmp @r1 - sub r4,r0 - .align 2 -8000: .long 5000b - -.previous - mov.l @r15+,r8 - mov.l @r15+,r9 - mov.l @r15+,r10 - rts - mov.l @r15+,r11 -- cgit v1.2.3-70-g09d2 From 0dcb957db5eb658d636097a9283dabdbf59de283 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 23 Nov 2007 13:55:02 +0900 Subject: sh: Build fixes for lib32 clear_page. Signed-off-by: Paul Mundt --- arch/sh/lib/clear_page.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/sh/lib') diff --git a/arch/sh/lib/clear_page.S b/arch/sh/lib/clear_page.S index 7a7c81ee3f0..36d44ef2b8a 100644 --- a/arch/sh/lib/clear_page.S +++ b/arch/sh/lib/clear_page.S @@ -9,10 +9,10 @@ #include /* - * clear_page_slow + * clear_page * @to: P1 address * - * void clear_page_slow(void *to) + * void clear_page(void *to) */ /* @@ -20,7 +20,7 @@ * r4 --- to * r5 --- to + PAGE_SIZE */ -ENTRY(clear_page_slow) +ENTRY(clear_page) mov r4,r5 mov.l .Llimit,r0 add r0,r5 -- cgit v1.2.3-70-g09d2 From 325df7f20467da07901c4f2b006d3457bba0adec Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 30 Nov 2007 16:34:26 +0900 Subject: sh: Explicit alignment for PAGE_SIZE in copy/clear_page(). Signed-off-by: Paul Mundt --- arch/sh/lib/clear_page.S | 2 ++ arch/sh/lib/copy_page.S | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/sh/lib') diff --git a/arch/sh/lib/clear_page.S b/arch/sh/lib/clear_page.S index 36d44ef2b8a..3539123fe51 100644 --- a/arch/sh/lib/clear_page.S +++ b/arch/sh/lib/clear_page.S @@ -50,6 +50,8 @@ ENTRY(clear_page) ! rts nop + + .balign 4 .Llimit: .long (PAGE_SIZE-28) ENTRY(__clear_user) diff --git a/arch/sh/lib/copy_page.S b/arch/sh/lib/copy_page.S index b879545fa28..e002b91c875 100644 --- a/arch/sh/lib/copy_page.S +++ b/arch/sh/lib/copy_page.S @@ -68,8 +68,9 @@ ENTRY(copy_page) rts nop - .align 2 + .balign 4 .Lpsz: .long PAGE_SIZE + /* * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n); * Return the number of bytes NOT copied -- cgit v1.2.3-70-g09d2