From 3c726f8dee6f55e96475574e9f645327e461884c Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 7 Nov 2005 11:06:55 +1100
Subject: [PATCH] ppc64: support 64k pages

Adds a new CONFIG_PPC_64K_PAGES which, when enabled, changes the kernel
base page size to 64K.  The resulting kernel still boots on any
hardware.  On current machines with 4K pages support only, the kernel
will maintain 16 "subpages" for each 64K page transparently.

Note that while real 64K capable HW has been tested, the current patch
will not enable it yet as such hardware is not released yet, and I'm
still verifying with the firmware architects the proper to get the
information from the newer hypervisors.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/mm/slb_low.S | 220 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 152 insertions(+), 68 deletions(-)

(limited to 'arch/powerpc/mm/slb_low.S')

diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a3a03da503b..3e18241b6f3 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -18,61 +18,28 @@
 
 #include <linux/config.h>
 #include <asm/processor.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/cputable.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
 
-/* void slb_allocate(unsigned long ea);
+/* void slb_allocate_realmode(unsigned long ea);
  *
  * Create an SLB entry for the given EA (user or kernel).
  * 	r3 = faulting address, r13 = PACA
  *	r9, r10, r11 are clobbered by this function
  * No other registers are examined or changed.
  */
-_GLOBAL(slb_allocate)
-	/*
-	 * First find a slot, round robin. Previously we tried to find
-	 * a free slot first but that took too long. Unfortunately we
-	 * dont have any LRU information to help us choose a slot.
-	 */
-#ifdef CONFIG_PPC_ISERIES
-	/*
-	 * On iSeries, the "bolted" stack segment can be cast out on
-	 * shared processor switch so we need to check for a miss on
-	 * it and restore it to the right slot.
-	 */
-	ld	r9,PACAKSAVE(r13)
-	clrrdi	r9,r9,28
-	clrrdi	r11,r3,28
-	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
-	cmpld	r9,r11
-	beq	3f
-#endif /* CONFIG_PPC_ISERIES */
-
-	ld	r10,PACASTABRR(r13)
-	addi	r10,r10,1
-	/* use a cpu feature mask if we ever change our slb size */
-	cmpldi	r10,SLB_NUM_ENTRIES
-
-	blt+	4f
-	li	r10,SLB_NUM_BOLTED
-
-4:
-	std	r10,PACASTABRR(r13)
-3:
-	/* r3 = faulting address, r10 = entry */
+_GLOBAL(slb_allocate_realmode)
+	/* r3 = faulting address */
 
 	srdi	r9,r3,60		/* get region */
-	srdi	r3,r3,28		/* get esid */
+	srdi	r10,r3,28		/* get esid */
 	cmpldi	cr7,r9,0xc		/* cmp KERNELBASE for later use */
 
-	rldimi	r10,r3,28,0		/* r10= ESID<<28 | entry */
-	oris	r10,r10,SLB_ESID_V@h	/* r10 |= SLB_ESID_V */
-
-	/* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
-
+	/* r3 = address, r10 = esid, cr7 = <>KERNELBASE */
 	blt	cr7,0f			/* user or kernel? */
 
 	/* kernel address: proto-VSID = ESID */
@@ -81,43 +48,161 @@ _GLOBAL(slb_allocate)
 	 * top segment.  That's ok, the scramble below will translate
 	 * it to VSID 0, which is reserved as a bad VSID - one which
 	 * will never have any pages in it.  */
-	li	r11,SLB_VSID_KERNEL
-BEGIN_FTR_SECTION
-	bne	cr7,9f
-	li	r11,(SLB_VSID_KERNEL|SLB_VSID_L)
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-	b	9f
 
-0:	/* user address: proto-VSID = context<<15 | ESID */
-	srdi.	r9,r3,USER_ESID_BITS
+	/* Check if hitting the linear mapping of the vmalloc/ioremap
+	 * kernel space
+	*/
+	bne	cr7,1f
+
+	/* Linear mapping encoding bits, the "li" instruction below will
+	 * be patched by the kernel at boot
+	 */
+_GLOBAL(slb_miss_kernel_load_linear)
+	li	r11,0
+	b	slb_finish_load
+
+1:	/* vmalloc/ioremap mapping encoding bits, the "li" instruction below
+	 * will be patched by the kernel at boot
+	 */
+_GLOBAL(slb_miss_kernel_load_virtual)
+	li	r11,0
+	b	slb_finish_load
+
+
+0:	/* user address: proto-VSID = context << 15 | ESID. First check
+	 * if the address is within the boundaries of the user region
+	 */
+	srdi.	r9,r10,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
+	/* Figure out if the segment contains huge pages */
 #ifdef CONFIG_HUGETLB_PAGE
 BEGIN_FTR_SECTION
+	b	1f
+END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE)
 	lhz	r9,PACAHIGHHTLBAREAS(r13)
-	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srdi	r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT)
 	srd	r9,r9,r11
 	lhz	r11,PACALOWHTLBAREAS(r13)
-	srd	r11,r11,r3
-	or	r9,r9,r11
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+	srd	r11,r11,r10
+	or.	r9,r9,r11
+	beq	1f
+_GLOBAL(slb_miss_user_load_huge)
+	li	r11,0
+	b	2f
+1:
 #endif /* CONFIG_HUGETLB_PAGE */
 
-	li	r11,SLB_VSID_USER
+_GLOBAL(slb_miss_user_load_normal)
+	li	r11,0
 
-#ifdef CONFIG_HUGETLB_PAGE
-BEGIN_FTR_SECTION
-	rldimi	r11,r9,8,55		/* shift masked bit into SLB_VSID_L */
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
+2:
+	ld	r9,PACACONTEXTID(r13)
+	rldimi	r10,r9,USER_ESID_BITS,0
+	b	slb_finish_load
+
+8:	/* invalid EA */
+	li	r10,0			/* BAD_VSID */
+	li	r11,SLB_VSID_USER	/* flags don't much matter */
+	b	slb_finish_load
+
+#ifdef __DISABLED__
+
+/* void slb_allocate_user(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ * 	r3 = faulting address, r13 = PACA
+ *	r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ *
+ * It is called with translation enabled in order to be able to walk the
+ * page tables. This is not currently used.
+ */
+_GLOBAL(slb_allocate_user)
+	/* r3 = faulting address */
+	srdi	r10,r3,28		/* get esid */
+
+	crset	4*cr7+lt		/* set "user" flag for later */
+
+	/* check if we fit in the range covered by the pagetables*/
+	srdi.	r9,r3,PGTABLE_EADDR_SIZE
+	crnot	4*cr0+eq,4*cr0+eq
+	beqlr
 
+	/* now we need to get to the page tables in order to get the page
+	 * size encoding from the PMD. In the future, we'll be able to deal
+	 * with 1T segments too by getting the encoding from the PGD instead
+	 */
+	ld	r9,PACAPGDIR(r13)
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,8,25,28
+	ldx	r9,r9,r11		/* get pgd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,3,17,28
+	ldx	r9,r9,r11		/* get pmd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+
+	/* build vsid flags */
+	andi.	r11,r9,SLB_VSID_LLP
+	ori	r11,r11,SLB_VSID_USER
+
+	/* get context to calculate proto-VSID */
 	ld	r9,PACACONTEXTID(r13)
-	rldimi	r3,r9,USER_ESID_BITS,0
+	rldimi	r10,r9,USER_ESID_BITS,0
+
+	/* fall through slb_finish_load */
+
+#endif /* __DISABLED__ */
 
-9:	/* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
-	ASM_VSID_SCRAMBLE(r3,r9)
 
-	rldimi	r11,r3,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+/*
+ * Finish loading of an SLB entry and return
+ *
+ * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE
+ */
+slb_finish_load:
+	ASM_VSID_SCRAMBLE(r10,r9)
+	rldimi	r11,r10,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+
+	/* r3 = EA, r11 = VSID data */
+	/*
+	 * Find a slot, round robin. Previously we tried to find a
+	 * free slot first but that took too long. Unfortunately we
+ 	 * dont have any LRU information to help us choose a slot.
+ 	 */
+#ifdef CONFIG_PPC_ISERIES
+	/*
+	 * On iSeries, the "bolted" stack segment can be cast out on
+	 * shared processor switch so we need to check for a miss on
+	 * it and restore it to the right slot.
+	 */
+	ld	r9,PACAKSAVE(r13)
+	clrrdi	r9,r9,28
+	clrrdi	r3,r3,28
+	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
+	cmpld	r9,r3
+	beq	3f
+#endif /* CONFIG_PPC_ISERIES */
+
+	ld	r10,PACASTABRR(r13)
+	addi	r10,r10,1
+	/* use a cpu feature mask if we ever change our slb size */
+	cmpldi	r10,SLB_NUM_ENTRIES
+
+	blt+	4f
+	li	r10,SLB_NUM_BOLTED
+
+4:
+	std	r10,PACASTABRR(r13)
+
+3:
+	rldimi	r3,r10,0,36		/* r3= EA[0:35] | entry */
+	oris	r10,r3,SLB_ESID_V@h	/* r3 |= SLB_ESID_V */
+
+	/* r3 = ESID data, r11 = VSID data */
 
 	/*
 	 * No need for an isync before or after this slbmte. The exception
@@ -125,7 +210,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	 */
 	slbmte	r11,r10
 
-	bgelr	cr7			/* we're done for kernel addresses */
+	/* we're done for kernel addresses */
+	crclr	4*cr0+eq		/* set result to "success" */
+	bgelr	cr7
 
 	/* Update the slb cache */
 	lhz	r3,PACASLBCACHEPTR(r13)	/* offset = paca->slb_cache_ptr */
@@ -143,9 +230,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	li	r3,SLB_CACHE_ENTRIES+1
 2:
 	sth	r3,PACASLBCACHEPTR(r13)	/* paca->slb_cache_ptr = offset */
+	crclr	4*cr0+eq		/* set result to "success" */
 	blr
 
-8:	/* invalid EA */
-	li	r3,0			/* BAD_VSID */
-	li	r11,SLB_VSID_USER	/* flags don't much matter */
-	b	9b
-- 
cgit v1.2.3-70-g09d2


From 7d24f0b8a53261709938ffabe3e00f88f6498df9 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 7 Nov 2005 00:57:52 -0800
Subject: [PATCH] ppc64: Fix bug in SLB miss handler for hugepages

This patch, however, should be applied on top of the 64k-page-size patch to
fix some problems with hugepage (some pre-existing, another introduced by
this patch).

The patch fixes a bug in the SLB miss handler for hugepages on ppc64
introduced by the dynamic hugepage patch (commit id
c594adad5653491813959277fb87a2fef54c4e05) due to a misunderstanding of the
srd instruction's behaviour (mea culpa).  The problem arises when a 64-bit
process maps some hugepages in the low 4GB of the address space (unusual).
In this case, as well as the 256M segment in question being marked for
hugepages, other segments at 32G intervals will be incorrectly marked for
hugepages.

In the process, this patch tweaks the semantics of the hugepage bitmaps to
be more sensible.  Previously, an address below 4G was marked for hugepages
if the appropriate segment bit in the "low areas" bitmask was set *or* if
the low bit in the "high areas" bitmap was set (which would mark all
addresses below 1TB for hugepage).  With this patch, any given address is
governed by a single bitmap.  Addresses below 4GB are marked for hugepage
if and only if their bit is set in the "low areas" bitmap (256M
granularity).  Addresses between 4GB and 1TB are marked for hugepage iff
the low bit in the "high areas" bitmap is set.  Higher addresses are marked
for hugepage iff their bit in the "high areas" bitmap is set (1TB
granularity).

To avoid conflicts, this patch must be applied on top of BenH's pending
patch for 64k base page size [0].  As such, this patch also addresses a
hugepage problem introduced by that patch.  That patch allows hugepages of
1MB in size on hardware which supports it, however, that won't work when
using 4k pages (4 level pagetable), because in that case hugepage PTEs are
stored at the PMD level, and each PMD entry maps 2MB.  This patch simply
disallows hugepages in that case (we can do something cleverer to re-enable
them some other day).

Built, booted, and a handful of hugepage related tests passed on POWER5
LPAR (both ARCH=powerpc and ARCH=ppc64).

[0] http://gate.crashing.org/~benh/ppc64-64k-pages.diff

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/mm/hash_utils_64.c |  6 ++++--
 arch/powerpc/mm/hugetlbpage.c   |  6 ++++++
 arch/powerpc/mm/slb_low.S       | 13 +++++++++----
 include/asm-ppc64/pgtable-4k.h  |  3 +++
 include/asm-ppc64/pgtable-64k.h |  3 +++
 5 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc/mm/slb_low.S')

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index b2f3dbca695..f15dfb92dec 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -329,12 +329,14 @@ static void __init htab_init_page_sizes(void)
 	 */
 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
 		mmu_huge_psize = MMU_PAGE_16M;
+	/* With 4k/4level pagetables, we can't (for now) cope with a
+	 * huge page size < PMD_SIZE */
 	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
 		mmu_huge_psize = MMU_PAGE_1M;
 
 	/* Calculate HPAGE_SHIFT and sanity check it */
-	if (mmu_psize_defs[mmu_huge_psize].shift > 16 &&
-	    mmu_psize_defs[mmu_huge_psize].shift < 28)
+	if (mmu_psize_defs[mmu_huge_psize].shift > MIN_HUGEPTE_SHIFT &&
+	    mmu_psize_defs[mmu_huge_psize].shift < SID_SHIFT)
 		HPAGE_SHIFT = mmu_psize_defs[mmu_huge_psize].shift;
 	else
 		HPAGE_SHIFT = 0; /* No huge pages dude ! */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0073a04047e..426c269e552 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -212,6 +212,12 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
 
 	BUG_ON(area >= NUM_HIGH_AREAS);
 
+	/* Hack, so that each addresses is controlled by exactly one
+	 * of the high or low area bitmaps, the first high area starts
+	 * at 4GB, not 0 */
+	if (start == 0)
+		start = 0x100000000UL;
+
 	/* Check no VMAs are in the region */
 	vma = find_vma(mm, start);
 	if (vma && (vma->vm_start < end))
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 3e18241b6f3..950ffc5848c 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -80,12 +80,17 @@ _GLOBAL(slb_miss_kernel_load_virtual)
 BEGIN_FTR_SECTION
 	b	1f
 END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE)
+	cmpldi	r10,16
+
+	lhz	r9,PACALOWHTLBAREAS(r13)
+	mr	r11,r10
+	blt	5f
+
 	lhz	r9,PACAHIGHHTLBAREAS(r13)
 	srdi	r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT)
-	srd	r9,r9,r11
-	lhz	r11,PACALOWHTLBAREAS(r13)
-	srd	r11,r11,r10
-	or.	r9,r9,r11
+
+5:	srd	r9,r9,r11
+	andi.	r9,r9,1
 	beq	1f
 _GLOBAL(slb_miss_user_load_huge)
 	li	r11,0
diff --git a/include/asm-ppc64/pgtable-4k.h b/include/asm-ppc64/pgtable-4k.h
index c883a274855..e9590c06ad9 100644
--- a/include/asm-ppc64/pgtable-4k.h
+++ b/include/asm-ppc64/pgtable-4k.h
@@ -23,6 +23,9 @@
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
 
+/* With 4k base page size, hugepage PTEs go at the PMD level */
+#define MIN_HUGEPTE_SHIFT	PMD_SHIFT
+
 /* PUD_SHIFT determines what a third-level page table entry can map */
 #define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
 #define PUD_SIZE	(1UL << PUD_SHIFT)
diff --git a/include/asm-ppc64/pgtable-64k.h b/include/asm-ppc64/pgtable-64k.h
index c5f437c86b3..154f1840ece 100644
--- a/include/asm-ppc64/pgtable-64k.h
+++ b/include/asm-ppc64/pgtable-64k.h
@@ -14,6 +14,9 @@
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
+/* With 4k base page size, hugepage PTEs go at the PMD level */
+#define MIN_HUGEPTE_SHIFT	PAGE_SHIFT
+
 /* PMD_SHIFT determines what a second-level page table entry can map */
 #define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
 #define PMD_SIZE	(1UL << PMD_SHIFT)
-- 
cgit v1.2.3-70-g09d2