From e28f7faf05159f1cfd564596f5e6178edba6bd49 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 5 Aug 2005 19:39:06 +1000
Subject: [PATCH] Four level pagetables for ppc64

Implement 4-level pagetables for ppc64

This patch implements full four-level page tables for ppc64, thereby
extending the usable user address range to 44 bits (16T).

The patch uses a full page for the tables at the bottom and top level,
and a quarter page for the intermediate levels.  It uses full 64-bit
pointers at every level, thus also increasing the addressable range of
physical memory.  This patch also tweaks the VSID allocation to allow
matching range for user addresses (this halves the number of available
contexts) and adds some #if and BUILD_BUG sanity checks.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/mm/hash_utils.c  |   2 +-
 arch/ppc64/mm/hugetlbpage.c | 187 ++++++++++++++------------------------------
 arch/ppc64/mm/imalloc.c     |   2 +-
 arch/ppc64/mm/init.c        |  62 ++++++++++-----
 arch/ppc64/mm/slb_low.S     |   2 +-
 arch/ppc64/mm/tlb.c         |  95 ++++++++++++----------
 6 files changed, 158 insertions(+), 192 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 623b5d130c3..65d6e852794 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	int local = 0;
 	cpumask_t tmp;
 
-	if ((ea & ~REGION_MASK) > EADDR_MASK)
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
 		return 1;
 
  	switch (REGION_ID(ea)) {
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index f9524602818..a13e44230a6 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -27,124 +27,91 @@
 
 #include <linux/sysctl.h>
 
-#define	HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
-#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
-#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))
-
-#define HUGEPTE_INDEX_SIZE	9
-#define HUGEPGD_INDEX_SIZE	10
-
-#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
-
-static inline int hugepgd_index(unsigned long addr)
-{
-	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
-
-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-	if (! mm->context.huge_pgdir)
-		return NULL;
+	BUG_ON(! in_hugepage_area(mm->context, addr));
 
+	addr &= HPAGE_MASK;
+
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, addr);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, addr);
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
 
-	index = hugepgd_index(addr);
-	BUG_ON(index >= PTRS_PER_HUGEPGD);
-	return (pud_t *)(mm->context.huge_pgdir + index);
+	return NULL;
 }
 
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
-
-	if (pud_none(*dir))
-		return NULL;
-
-	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-	return (pte_t *)pud_page(*dir) + index;
-}
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
-	if (! mm->context.huge_pgdir) {
-		pgd_t *new;
-		spin_unlock(&mm->page_table_lock);
-		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (mm->context.huge_pgdir)
-			pgd_free(new);
-		else
-			mm->context.huge_pgdir = new;
-	}
-	return hugepgd_offset(mm, addr);
-}
+	addr &= HPAGE_MASK;
 
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
-{
-	if (! pud_present(*dir)) {
-		pte_t *new;
+	pg = pgd_offset(mm, addr);
+	pu = pud_alloc(mm, pg, addr);
 
-		spin_unlock(&mm->page_table_lock);
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pud_present(*dir)) {
-			if (new)
-				kmem_cache_free(zero_cache, new);
-		} else {
-			struct page *ptepage;
-
-			if (! new)
-				return NULL;
-			ptepage = virt_to_page(new);
-			ptepage->mapping = (void *) mm;
-			ptepage->index = addr & HUGEPGDIR_MASK;
-			pud_populate(mm, dir, new);
+	if (pu) {
+		pm = pmd_alloc(mm, pu, addr);
+		if (pm) {
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
 		}
 	}
 
-	return hugepte_offset(dir, addr);
+	return NULL;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pud_t *pud;
+#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	int i;
 
-	pud = hugepgd_offset(mm, addr);
-	if (! pud)
-		return NULL;
+	if (pte_present(*ptep)) {
+		pte_clear(mm, addr, ptep);
+		flush_tlb_pending();
+	}
 
-	return hugepte_offset(pud, addr);
+	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+		ptep++;
+	}
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
 {
-	pud_t *pud;
+	unsigned long old = pte_update(ptep, ~0UL);
+	int i;
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+	if (old & _PAGE_HASHPTE)
+		hpte_update(mm, addr, old, 0);
 
-	pud = hugepgd_alloc(mm, addr);
-	if (! pud)
-		return NULL;
+	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+		ptep[i] = __pte(0);
 
-	return hugepte_alloc(mm, pud, addr);
+	return __pte(old);
 }
 
 /*
@@ -541,42 +508,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	}
 }
 
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-	int i;
-	pgd_t *pgdir;
-
-	spin_lock(&mm->page_table_lock);
-
-	pgdir = mm->context.huge_pgdir;
-	if (! pgdir)
-		goto out;
-
-	mm->context.huge_pgdir = NULL;
-
-	/* cleanup any hugepte pages leftover */
-	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-		pud_t *pud = (pud_t *)(pgdir + i);
-
-		if (! pud_none(*pud)) {
-			pte_t *pte = (pte_t *)pud_page(*pud);
-			struct page *ptepage = virt_to_page(pte);
-
-			ptepage->mapping = NULL;
-
-			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-			kmem_cache_free(zero_cache, pte);
-		}
-		pud_clear(pud);
-	}
-
-	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-	kmem_cache_free(zero_cache, pgdir);
-
- out:
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local)
 {
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index b6e75b891ac..c65b87b9275 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
 			break;
 		if ((unsigned long)tmp->addr >= ioremap_bot)
 			addr = tmp->size + (unsigned long) tmp->addr;
-		if (addr > IMALLOC_END-size) 
+		if (addr >= IMALLOC_END-size)
 			return 1;
 	}
 	*im_addr = addr;
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index e58a24d4287..87f256df8de 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -66,6 +66,14 @@
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
 
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
 int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -226,7 +234,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size,
 	 * Before that, we map using addresses going
 	 * up from ioremap_bot.  imalloc will use
 	 * the addresses from ioremap_bot through
-	 * IMALLOC_END (0xE000001fffffffff)
+	 * IMALLOC_END
 	 * 
 	 */
 	pa = addr & PAGE_MASK;
@@ -417,12 +425,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	int index;
 	int err;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We leave htlb_segs as it was, but for a fork, we need to
-	 * clear the huge_pgdir. */
-	mm->context.huge_pgdir = NULL;
-#endif
-
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
 		return -ENOMEM;
@@ -453,8 +455,6 @@ void destroy_context(struct mm_struct *mm)
 	spin_unlock(&mmu_context_lock);
 
 	mm->context.id = NO_CONTEXT;
-
-	hugetlb_mm_free_pgd(mm);
 }
 
 /*
@@ -833,23 +833,43 @@ void __iomem * reserve_phb_iospace(unsigned long size)
 	return virt_addr;
 }
 
-kmem_cache_t *zero_cache;
-
-static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 {
-	memset(pte, 0, PAGE_SIZE);
+	memset(addr, 0, kmem_cache_size(cache));
 }
 
+static const int pgtable_cache_size[2] = {
+	PTE_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+	"pgd_pte_cache", "pud_pmd_cache",
+};
+
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+
 void pgtable_cache_init(void)
 {
-	zero_cache = kmem_cache_create("zero",
-				PAGE_SIZE,
-				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-				zero_ctor,
-				NULL);
-	if (!zero_cache)
-		panic("pgtable_cache_init(): could not create zero_cache!\n");
+	int i;
+
+	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
+	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
+	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
+	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
+
+	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
+		int size = pgtable_cache_size[i];
+		const char *name = pgtable_cache_name[i];
+
+		pgtable_cache[i] = kmem_cache_create(name,
+						     size, size,
+						     SLAB_HWCACHE_ALIGN
+						     | SLAB_MUST_HWCACHE_ALIGN,
+						     zero_ctor,
+						     NULL);
+		if (! pgtable_cache[i])
+			panic("pgtable_cache_init(): could not create %s!\n",
+			      name);
+	}
 }
 
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
index 8379d678f70..f20fc52483a 100644
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
@@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 0:	/* user address: proto-VSID = context<<15 | ESID */
 	li	r11,SLB_VSID_USER
 
-	srdi.	r9,r3,13
+	srdi.	r9,r3,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
index 26f0172c452..d8a6593a13f 100644
--- a/arch/ppc64/mm/tlb.c
+++ b/arch/ppc64/mm/tlb.c
@@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
 	/* This is safe as we are holding page_table_lock */
         cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
@@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pte_free(ptepage);
+		pgtable_free(pgf);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pte_free_now(ptepage);
+			pgtable_free_now(pgf);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
-	(*batchp)->pages[(*batchp)->index++] = ptepage;
+	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
 		*batchp = NULL;
@@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	put_cpu();
 }
 
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-void pte_free_now(struct page *ptepage)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-	pte_free(ptepage);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pte_free(batch->pages[i]);
-	free_page((unsigned long)batch);
-}
-
-void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
 void pte_free_finish(void)
 {
 	/* This is safe as we are holding page_table_lock */
-- 
cgit v1.2.3-70-g09d2


From 8c65b5c955b8598d9c63b4e97392377269873a54 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 12 Jul 2005 17:42:49 +1000
Subject: [PATCH] ppc64: move iSeries vio iommu init

Since the iSeries vio iommu tables cannot be used until after the vio bus has
been initialised, move the initialisation of the tables to there.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/kernel/iSeries_vio.c | 3 ++-
 arch/ppc64/mm/init.c            | 3 ---
 include/asm-ppc64/iommu.h       | 3 ---
 3 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c
index e876b438027..48f0ebf4405 100644
--- a/arch/ppc64/kernel/iSeries_vio.c
+++ b/arch/ppc64/kernel/iSeries_vio.c
@@ -27,7 +27,7 @@ EXPORT_SYMBOL(iSeries_vio_dev);
 static struct iommu_table veth_iommu_table;
 static struct iommu_table vio_iommu_table;
 
-void __init iommu_vio_init(void)
+static void __init iommu_vio_init(void)
 {
 	struct iommu_table *t;
 	struct iommu_table_cb cb;
@@ -123,6 +123,7 @@ static int __init vio_bus_init_iseries(void)
 
 	err = vio_bus_init();
 	if (err == 0) {
+		iommu_vio_init();
 		vio_bus_device.iommu_table = &vio_iommu_table;
 		iSeries_vio_dev = &vio_bus_device.dev;
 		probe_bus_iseries();
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index 87f256df8de..9edfe267123 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -686,9 +686,6 @@ void __init mem_init(void)
 
 	mem_init_done = 1;
 
-#ifdef CONFIG_PPC_ISERIES
-	iommu_vio_init();
-#endif
 	/* Initialize the vDSO */
 	vdso_init();
 }
diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h
index 729de5cc21d..72dcf8116b0 100644
--- a/include/asm-ppc64/iommu.h
+++ b/include/asm-ppc64/iommu.h
@@ -104,9 +104,6 @@ extern void iommu_devnode_init_pSeries(struct device_node *dn);
 
 #ifdef CONFIG_PPC_ISERIES
 
-/* Initializes tables for bio buses */
-extern void __init iommu_vio_init(void);
-
 struct iSeries_Device_Node;
 /* Creates table for an individual device node */
 extern void iommu_devnode_init_iSeries(struct iSeries_Device_Node *dn);
-- 
cgit v1.2.3-70-g09d2


From aefd16b0c5a594b5feaba23954ad74061f45c8a5 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 3 Aug 2005 20:21:24 +1000
Subject: [PATCH] ppc64: Remove redundant uses of physRpn_to_absRpn

physRpn_to_absRpn is a no-op on non-iSeries platforms, remove the two
redundant calls.

There's only one caller on iSeries so fold the logic in there so we can get
rid of it completely.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/kernel/iSeries_htab.c | 5 ++++-
 arch/ppc64/kernel/pSeries_lpar.c | 3 +--
 arch/ppc64/mm/hash_native.c      | 3 +--
 include/asm-ppc64/abs_addr.h     | 8 --------
 4 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/iSeries_htab.c b/arch/ppc64/kernel/iSeries_htab.c
index b0250ae4a72..2192055a90a 100644
--- a/arch/ppc64/kernel/iSeries_htab.c
+++ b/arch/ppc64/kernel/iSeries_htab.c
@@ -41,6 +41,7 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 				unsigned long prpn, unsigned long vflags,
 				unsigned long rflags)
 {
+	unsigned long arpn;
 	long slot;
 	hpte_t lhpte;
 	int secondary = 0;
@@ -70,8 +71,10 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 		slot &= 0x7fffffffffffffff;
 	}
 
+	arpn = phys_to_abs(prpn << PAGE_SHIFT) >> PAGE_SHIFT;
+
 	lhpte.v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
-	lhpte.r = (physRpn_to_absRpn(prpn) << HPTE_R_RPN_SHIFT) | rflags;
+	lhpte.r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	/* Now fill in the actual HPTE */
 	HvCallHpt_addValidate(slot, secondary, &lhpte);
diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c
index 56845543c89..0a3ddc9227c 100644
--- a/arch/ppc64/kernel/pSeries_lpar.c
+++ b/arch/ppc64/kernel/pSeries_lpar.c
@@ -278,7 +278,6 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 			      unsigned long va, unsigned long prpn,
 			      unsigned long vflags, unsigned long rflags)
 {
-	unsigned long arpn = physRpn_to_absRpn(prpn);
 	unsigned long lpar_rc;
 	unsigned long flags;
 	unsigned long slot;
@@ -289,7 +288,7 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	if (vflags & HPTE_V_LARGE)
 		hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT);
 
-	hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	/* Now fill in the actual HPTE */
 	/* Set CEC cookie to 0         */
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
index a6abd3a979b..7626bb59954 100644
--- a/arch/ppc64/mm/hash_native.c
+++ b/arch/ppc64/mm/hash_native.c
@@ -51,7 +51,6 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 			unsigned long prpn, unsigned long vflags,
 			unsigned long rflags)
 {
-	unsigned long arpn = physRpn_to_absRpn(prpn);
 	hpte_t *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
 	int i;
@@ -74,7 +73,7 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 	hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
 	if (vflags & HPTE_V_LARGE)
 		va &= ~(1UL << HPTE_V_AVPN_SHIFT);
-	hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	hptep->r = hpte_r;
 	/* Guarantee the second dword is visible before the valid bit */
diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h
index 025527742fe..ab4320c1cf5 100644
--- a/include/asm-ppc64/abs_addr.h
+++ b/include/asm-ppc64/abs_addr.h
@@ -56,14 +56,6 @@ static inline unsigned long phys_to_abs(unsigned long pa)
 	return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK);
 }
 
-static inline unsigned long
-physRpn_to_absRpn(unsigned long rpn)
-{
-	unsigned long pa = rpn << PAGE_SHIFT;
-	unsigned long aa = phys_to_abs(pa);
-	return (aa >> PAGE_SHIFT);
-}
-
 /* A macro so it can take pointers or unsigned long. */
 #define abs_to_phys(aa) lmb_abs_to_phys((unsigned long)(aa))
 
-- 
cgit v1.2.3-70-g09d2


From e88bcd1b29f63738b702e57d831758706162347e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 3 Aug 2005 20:21:25 +1000
Subject: [PATCH] ppc64: Remove redundant abs_to_phys() macro

abs_to_phys() is a macro that turns out to do nothing, and also has the
unfortunate property that it's not the inverse of phys_to_abs() on iSeries.

The following is for my benefit as much as everyone else.

With CONFIG_MSCHUNKS enabled, the lmb code is changed such that it keeps
a physbase variable for each lmb region. This is used to take the possibly
discontiguous lmb regions and present them as a contiguous address space
beginning from zero.

In this context each lmb region's base address is its "absolute" base
address, and its physbase is it's "physical" address (from Linux's point of
view). The abs_to_phys() macro does the mapping from "absolute" to "physical".

Note: This is not related to the iSeries mapping of physical to absolute
(ie. Hypervisor) addresses which is maintained with the msChunks structure.
And the msChunks structure is not controlled via CONFIG_MSCHUNKS.

Once upon a time you could compile for non-iSeries with CONFIG_MSCHUNKS
enabled. But these days CONFIG_MSCHUNKS depends on CONFIG_PPC_ISERIES, so
for non-iSeries code abs_to_phys() is a no-op.

On iSeries we always have one lmb region which spans from 0 to
systemcfg->physicalMemorySize (arch/ppc64/kernel/iSeries_setup.c line 383).
This region has a base (ie. absolute) address of 0, and a physbase address
of 0 (as calculated in lmb_analyze() (arch/ppc64/kernel/lmb.c line 144)).

On iSeries, abs_to_phys(aa) is defined as lmb_abs_to_phys(aa), which finds
the lmb region containing aa (and there's only one, ie. 0), and then does:

 return lmb.memory.region[0].physbase + (aa - lmb.memory.region[0].base)

physbase == base == 0, so you're left with "return aa".

So remove abs_to_phys(), and lmb_abs_to_phys() which is the implementation
of abs_to_phys() for iSeries.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/kernel/lmb.c      | 19 -------------------
 arch/ppc64/mm/init.c         |  4 +---
 include/asm-ppc64/abs_addr.h |  6 +-----
 3 files changed, 2 insertions(+), 27 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c
index 6cb275615fc..111da736652 100644
--- a/arch/ppc64/kernel/lmb.c
+++ b/arch/ppc64/kernel/lmb.c
@@ -313,25 +313,6 @@ lmb_end_of_DRAM(void)
 	return 0;
 }
 
-unsigned long __init
-lmb_abs_to_phys(unsigned long aa)
-{
-	unsigned long i, pa = aa;
-	struct lmb *_lmb = &lmb;
-	struct lmb_region *_mem = &(_lmb->memory);
-
-	for (i=0; i < _mem->cnt; i++) {
-		unsigned long lmbbase = _mem->region[i].base;
-		unsigned long lmbsize = _mem->region[i].size;
-		if ( lmb_addrs_overlap(aa,1,lmbbase,lmbsize) ) {
-			pa = _mem->region[i].physbase + (aa - lmbbase);
-			break;
-		}
-	}
-
-	return pa;
-}
-
 /*
  * Truncate the lmb list to memory_limit if it's set
  * You must call lmb_analyze() after this.
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index 9edfe267123..a16cf12c586 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -42,7 +42,6 @@
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
-#include <asm/abs_addr.h>
 #include <asm/prom.h>
 #include <asm/lmb.h>
 #include <asm/rtas.h>
@@ -167,7 +166,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
 		if (!ptep)
 			return -ENOMEM;
-		pa = abs_to_phys(pa);
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 		spin_unlock(&init_mm.page_table_lock);
@@ -547,7 +545,7 @@ void __init do_init_bootmem(void)
 	 */
 	bootmap_pages = bootmem_bootmap_pages(total_pages);
 
-	start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE));
+	start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	BUG_ON(!start);
 
 	boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h
index ab4320c1cf5..200db1c45f2 100644
--- a/include/asm-ppc64/abs_addr.h
+++ b/include/asm-ppc64/abs_addr.h
@@ -56,9 +56,6 @@ static inline unsigned long phys_to_abs(unsigned long pa)
 	return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK);
 }
 
-/* A macro so it can take pointers or unsigned long. */
-#define abs_to_phys(aa) lmb_abs_to_phys((unsigned long)(aa))
-
 #else  /* !CONFIG_MSCHUNKS */
 
 #define chunk_to_addr(chunk) ((unsigned long)(chunk))
@@ -68,12 +65,11 @@ static inline unsigned long phys_to_abs(unsigned long pa)
 
 #define phys_to_abs(pa) (pa)
 #define physRpn_to_absRpn(rpn) (rpn)
-#define abs_to_phys(aa) (aa)
 
 #endif /* !CONFIG_MSCHUNKS */
 
 /* Convenience macros */
 #define virt_to_abs(va) phys_to_abs(__pa(va))
-#define abs_to_virt(aa) __va(abs_to_phys(aa))
+#define abs_to_virt(aa) __va(aa)
 
 #endif /* _ABS_ADDR_H */
-- 
cgit v1.2.3-70-g09d2


From 180379dcefb39e8bd05d562b0685e9084dffcc0a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 3 Aug 2005 20:21:26 +1000
Subject: [PATCH] ppc64: Remove physbase from the lmb_property struct

We no longer need the lmb code to know about abs and phys addresses, so
remove the physbase variable from the lmb_property struct.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/kernel/lmb.c    | 23 -----------------------
 arch/ppc64/mm/hash_utils.c |  2 +-
 arch/ppc64/mm/init.c       | 27 ++++++++++++---------------
 arch/ppc64/mm/numa.c       |  2 +-
 include/asm-ppc64/lmb.h    |  1 -
 5 files changed, 14 insertions(+), 41 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c
index 111da736652..6ed6312d848 100644
--- a/arch/ppc64/kernel/lmb.c
+++ b/arch/ppc64/kernel/lmb.c
@@ -37,8 +37,6 @@ void lmb_dump_all(void)
 	for (i=0; i < lmb.memory.cnt ;i++) {
 		udbg_printf("    memory.region[0x%x].base       = 0x%lx\n",
 			    i, lmb.memory.region[i].base);
-		udbg_printf("		      .physbase = 0x%lx\n",
-			    lmb.memory.region[i].physbase);
 		udbg_printf("		      .size     = 0x%lx\n",
 			    lmb.memory.region[i].size);
 	}
@@ -50,8 +48,6 @@ void lmb_dump_all(void)
 	for (i=0; i < lmb.reserved.cnt ;i++) {
 		udbg_printf("    reserved.region[0x%x].base       = 0x%lx\n",
 			    i, lmb.reserved.region[i].base);
-		udbg_printf("		      .physbase = 0x%lx\n",
-			    lmb.reserved.region[i].physbase);
 		udbg_printf("		      .size     = 0x%lx\n",
 			    lmb.reserved.region[i].size);
 	}
@@ -97,7 +93,6 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2)
 	rgn->region[r1].size += rgn->region[r2].size;
 	for (i=r2; i < rgn->cnt-1; i++) {
 		rgn->region[i].base = rgn->region[i+1].base;
-		rgn->region[i].physbase = rgn->region[i+1].physbase;
 		rgn->region[i].size = rgn->region[i+1].size;
 	}
 	rgn->cnt--;
@@ -127,21 +122,12 @@ lmb_analyze(void)
 	unsigned long i;
 	unsigned long mem_size = 0;
 	unsigned long size_mask = 0;
-#ifdef CONFIG_MSCHUNKS
-	unsigned long physbase = 0;
-#endif
 
 	for (i=0; i < lmb.memory.cnt; i++) {
 		unsigned long lmb_size;
 
 		lmb_size = lmb.memory.region[i].size;
 
-#ifdef CONFIG_MSCHUNKS
-		lmb.memory.region[i].physbase = physbase;
-		physbase += lmb_size;
-#else
-		lmb.memory.region[i].physbase = lmb.memory.region[i].base;
-#endif
 		mem_size += lmb_size;
 		size_mask |= lmb_size;
 	}
@@ -164,7 +150,6 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
 		adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize);
 		if ( adjacent > 0 ) {
 			rgn->region[i].base -= size;
-			rgn->region[i].physbase -= size;
 			rgn->region[i].size += size;
 			coalesced++;
 			break;
@@ -191,11 +176,9 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
 	for (i=rgn->cnt-1; i >= 0; i--) {
 		if (base < rgn->region[i].base) {
 			rgn->region[i+1].base = rgn->region[i].base;
-			rgn->region[i+1].physbase = rgn->region[i].physbase;
 			rgn->region[i+1].size = rgn->region[i].size;
 		}  else {
 			rgn->region[i+1].base = base;
-			rgn->region[i+1].physbase = lmb_abs_to_phys(base);
 			rgn->region[i+1].size = size;
 			break;
 		}
@@ -304,13 +287,7 @@ lmb_end_of_DRAM(void)
 {
 	int idx = lmb.memory.cnt - 1;
 
-#ifdef CONFIG_MSCHUNKS
-	return (lmb.memory.region[idx].physbase + lmb.memory.region[idx].size);
-#else
 	return (lmb.memory.region[idx].base + lmb.memory.region[idx].size);
-#endif /* CONFIG_MSCHUNKS */
-
-	return 0;
 }
 
 /*
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 65d6e852794..09475c8edf7 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -210,7 +210,7 @@ void __init htab_initialize(void)
 
 	/* create bolted the linear mapping in the hash table */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		base = lmb.memory.region[i].physbase + KERNELBASE;
+		base = lmb.memory.region[i].base + KERNELBASE;
 		size = lmb.memory.region[i].size;
 
 		DBG("creating mapping for region: %lx : %lx\n", base, size);
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index a16cf12c586..c02dc9809ca 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -482,9 +482,9 @@ void __init mm_init_ppc64(void)
 	for (i = 1; i < lmb.memory.cnt; i++) {
 		unsigned long base, prevbase, prevsize;
 
-		prevbase = lmb.memory.region[i-1].physbase;
+		prevbase = lmb.memory.region[i-1].base;
 		prevsize = lmb.memory.region[i-1].size;
-		base = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		if (base > (prevbase + prevsize)) {
 			io_hole_start = prevbase + prevsize;
 			io_hole_size = base  - (prevbase + prevsize);
@@ -511,11 +511,8 @@ int page_is_ram(unsigned long pfn)
 	for (i=0; i < lmb.memory.cnt; i++) {
 		unsigned long base;
 
-#ifdef CONFIG_MSCHUNKS
-		base = lmb.memory.region[i].physbase;
-#else
 		base = lmb.memory.region[i].base;
-#endif
+
 		if ((paddr >= base) &&
 			(paddr < (base + lmb.memory.region[i].size))) {
 			return 1;
@@ -556,25 +553,25 @@ void __init do_init_bootmem(void)
 	 * present.
 	 */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
+		unsigned long base, size;
 		unsigned long start_pfn, end_pfn;
 
-		physbase = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		size = lmb.memory.region[i].size;
 
-		start_pfn = physbase >> PAGE_SHIFT;
+		start_pfn = base >> PAGE_SHIFT;
 		end_pfn = start_pfn + (size >> PAGE_SHIFT);
 		memory_present(0, start_pfn, end_pfn);
 
-		free_bootmem(physbase, size);
+		free_bootmem(base, size);
 	}
 
 	/* reserve the sections we're already using */
 	for (i=0; i < lmb.reserved.cnt; i++) {
-		unsigned long physbase = lmb.reserved.region[i].physbase;
+		unsigned long base = lmb.reserved.region[i].base;
 		unsigned long size = lmb.reserved.region[i].size;
 
-		reserve_bootmem(physbase, size);
+		reserve_bootmem(base, size);
 	}
 }
 
@@ -613,10 +610,10 @@ static int __init setup_kcore(void)
 	int i;
 
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
+		unsigned long base, size;
 		struct kcore_list *kcore_mem;
 
-		physbase = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		size = lmb.memory.region[i].size;
 
 		/* GFP_ATOMIC to avoid might_sleep warnings during boot */
@@ -624,7 +621,7 @@ static int __init setup_kcore(void)
 		if (!kcore_mem)
 			panic("mem_init: kmalloc failed\n");
 
-		kclist_add(kcore_mem, __va(physbase), size);
+		kclist_add(kcore_mem, __va(base), size);
 	}
 
 	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
index 0b191f2de01..c3116f0d788 100644
--- a/arch/ppc64/mm/numa.c
+++ b/arch/ppc64/mm/numa.c
@@ -671,7 +671,7 @@ new_range:
 		 * Mark reserved regions on this node
 		 */
 		for (i = 0; i < lmb.reserved.cnt; i++) {
-			unsigned long physbase = lmb.reserved.region[i].physbase;
+			unsigned long physbase = lmb.reserved.region[i].base;
 			unsigned long size = lmb.reserved.region[i].size;
 
 			if (pa_to_nid(physbase) != nid &&
diff --git a/include/asm-ppc64/lmb.h b/include/asm-ppc64/lmb.h
index a6cbca21ac1..cb368bf0f26 100644
--- a/include/asm-ppc64/lmb.h
+++ b/include/asm-ppc64/lmb.h
@@ -22,7 +22,6 @@
 
 struct lmb_property {
 	unsigned long base;
-	unsigned long physbase;
 	unsigned long size;
 };
 
-- 
cgit v1.2.3-70-g09d2


From c594adad5653491813959277fb87a2fef54c4e05 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 11 Aug 2005 16:55:21 +1000
Subject: [PATCH] Dynamic hugepage addresses for ppc64

Paulus, I think this is now a reasonable candidate for the post-2.6.13
queue.

Relax address restrictions for hugepages on ppc64

Presently, 64-bit applications on ppc64 may only use hugepages in the
address region from 1-1.5T.  Furthermore, if hugepages are enabled in
the kernel config, they may only use hugepages and never normal pages
in this area.  This patch relaxes this restriction, allowing any
address to be used with hugepages, but with a 1TB granularity.  That
is if you map a hugepage anywhere in the region 1TB-2TB, that entire
area will be reserved exclusively for hugepages for the remainder of
the process's lifetime.  This works analagously to hugepages in 32-bit
applications, where hugepages can be mapped anywhere, but with 256MB
(mmu segment) granularity.

This patch applies on top of the four level pagetable patch
(http://patchwork.ozlabs.org/linuxppc64/patch?id=1936).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/kernel/asm-offsets.c |   3 +-
 arch/ppc64/mm/hugetlbpage.c     | 211 ++++++++++++++++++++++++++++++----------
 arch/ppc64/mm/slb_low.S         |  23 ++---
 include/asm-ppc64/mmu.h         |   2 +-
 include/asm-ppc64/page.h        |  29 +++---
 5 files changed, 190 insertions(+), 78 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/asm-offsets.c b/arch/ppc64/kernel/asm-offsets.c
index abb9e5b5da0..17e35d0fed0 100644
--- a/arch/ppc64/kernel/asm-offsets.c
+++ b/arch/ppc64/kernel/asm-offsets.c
@@ -94,7 +94,8 @@ int main(void)
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_HUGETLB_PAGE
-	DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs));
+	DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
+	DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
 #endif /* CONFIG_HUGETLB_PAGE */
 	DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr));
         DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index a13e44230a6..e7833c80eb6 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -27,6 +27,9 @@
 
 #include <linux/sysctl.h>
 
+#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
@@ -129,15 +132,17 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-static void flush_segments(void *parm)
+static void flush_low_segments(void *parm)
 {
-	u16 segs = (unsigned long) parm;
+	u16 areas = (unsigned long) parm;
 	unsigned long i;
 
 	asm volatile("isync" : : : "memory");
 
-	for (i = 0; i < 16; i++) {
-		if (! (segs & (1U << i)))
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+	for (i = 0; i < NUM_LOW_AREAS; i++) {
+		if (! (areas & (1U << i)))
 			continue;
 		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
 	}
@@ -145,13 +150,33 @@ static void flush_segments(void *parm)
 	asm volatile("isync" : : : "memory");
 }
 
-static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
+static void flush_high_segments(void *parm)
+{
+	u16 areas = (unsigned long) parm;
+	unsigned long i, j;
+
+	asm volatile("isync" : : : "memory");
+
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++) {
+		if (! (areas & (1U << i)))
+			continue;
+		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+			asm volatile("slbie %0"
+				     :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
+	}
+
+	asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
 {
-	unsigned long start = seg << SID_SHIFT;
-	unsigned long end = (seg+1) << SID_SHIFT;
+	unsigned long start = area << SID_SHIFT;
+	unsigned long end = (area+1) << SID_SHIFT;
 	struct vm_area_struct *vma;
 
-	BUG_ON(seg >= 16);
+	BUG_ON(area >= NUM_LOW_AREAS);
 
 	/* Check no VMAs are in the region */
 	vma = find_vma(mm, start);
@@ -161,20 +186,69 @@ static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
 	return 0;
 }
 
-static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << HTLB_AREA_SHIFT;
+	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
+	struct vm_area_struct *vma;
+
+	BUG_ON(area >= NUM_HIGH_AREAS);
+
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
 	unsigned long i;
 
-	newsegs &= ~(mm->context.htlb_segs);
-	if (! newsegs)
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+	newareas &= ~(mm->context.low_htlb_areas);
+	if (! newareas)
 		return 0; /* The segments we want are already open */
 
-	for (i = 0; i < 16; i++)
-		if ((1 << i) & newsegs)
-			if (prepare_low_seg_for_htlb(mm, i) != 0)
+	for (i = 0; i < NUM_LOW_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_low_area_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.low_htlb_areas |= newareas;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = mm->context;
+
+	/* the context change must make it to memory before the flush,
+	 * so that further SLB misses do the right thing. */
+	mb();
+	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+	unsigned long i;
+
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+		     != NUM_HIGH_AREAS);
+
+	newareas &= ~(mm->context.high_htlb_areas);
+	if (! newareas)
+		return 0; /* The areas we want are already open */
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_high_area_for_htlb(mm, i) != 0)
 				return -EBUSY;
 
-	mm->context.htlb_segs |= newsegs;
+	mm->context.high_htlb_areas |= newareas;
 
 	/* update the paca copy of the context struct */
 	get_paca()->context = mm->context;
@@ -182,29 +256,33 @@ static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
 
 	return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (within_hugepage_high_range(addr, len))
-		return 0;
-	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
-		int err;
-		/* Yes, we need both tests, in case addr+len overflows
-		 * 64-bit arithmetic */
-		err = open_low_hpage_segs(current->mm,
+	int err;
+
+	if ( (addr+len) < addr )
+		return -EINVAL;
+
+	if ((addr + len) < 0x100000000UL)
+		err = open_low_hpage_areas(current->mm,
 					  LOW_ESID_MASK(addr, len));
-		if (err)
-			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
-			       " failed (segs: 0x%04hx)\n", addr, len,
-			       LOW_ESID_MASK(addr, len));
+	else
+		err = open_high_hpage_areas(current->mm,
+					    HTLB_AREA_MASK(addr, len));
+	if (err) {
+		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+		       addr, len,
+		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
 		return err;
 	}
 
-	return -EINVAL;
+	return 0;
 }
 
 struct page *
@@ -276,8 +354,8 @@ full_search:
 			vma = find_vma(mm, addr);
 			continue;
 		}
-		if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_END;
+		if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
 			vma = find_vma(mm, addr);
 			continue;
 		}
@@ -356,8 +434,9 @@ hugepage_recheck:
 		if (touches_hugepage_low_range(mm, addr, len)) {
 			addr = (addr & ((~0) << SID_SHIFT)) - len;
 			goto hugepage_recheck;
-		} else if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_BASE - len;
+		} else if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+			goto hugepage_recheck;
 		}
 
 		/*
@@ -448,23 +527,28 @@ static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
 	return -ENOMEM;
 }
 
-static unsigned long htlb_get_high_area(unsigned long len)
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
 {
-	unsigned long addr = TASK_HPAGE_BASE;
+	unsigned long addr = 0x100000000UL;
 	struct vm_area_struct *vma;
 
 	vma = find_vma(current->mm, addr);
-	for (vma = find_vma(current->mm, addr);
-	     addr + len <= TASK_HPAGE_END;
-	     vma = vma->vm_next) {
+	while (addr + len <= TASK_SIZE_USER64) {
 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-		BUG_ON(! within_hugepage_high_range(addr, len));
+
+		if (! __within_hugepage_high_range(addr, len, areamask)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
 
 		if (!vma || (addr + len) <= vma->vm_start)
 			return addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-		/* Because we're in a hugepage region, this alignment
-		 * should not skip us over any VMAs */
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
 	}
 
 	return -ENOMEM;
@@ -474,6 +558,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
+	int lastshift;
+	u16 areamask, curareas;
+
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -481,31 +568,49 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		int lastshift = 0;
-		u16 segmask, cursegs = current->mm->context.htlb_segs;
+		curareas = current->mm->context.low_htlb_areas;
 
 		/* First see if we can do the mapping in the existing
-		 * low hpage segments */
-		addr = htlb_get_low_area(len, cursegs);
+		 * low areas */
+		addr = htlb_get_low_area(len, curareas);
 		if (addr != -ENOMEM)
 			return addr;
 
-		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
-		     ! lastshift; segmask >>=1) {
-			if (segmask & 1)
+		lastshift = 0;
+		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
 				lastshift = 1;
 
-			addr = htlb_get_low_area(len, cursegs | segmask);
+			addr = htlb_get_low_area(len, curareas | areamask);
 			if ((addr != -ENOMEM)
-			    && open_low_hpage_segs(current->mm, segmask) == 0)
+			    && open_low_hpage_areas(current->mm, areamask) == 0)
 				return addr;
 		}
-		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
-		       " enough segments\n");
-		return -ENOMEM;
 	} else {
-		return htlb_get_high_area(len);
+		curareas = current->mm->context.high_htlb_areas;
+
+		/* First see if we can do the mapping in the existing
+		 * high areas */
+		addr = htlb_get_high_area(len, curareas);
+		if (addr != -ENOMEM)
+			return addr;
+
+		lastshift = 0;
+		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
+				lastshift = 1;
+
+			addr = htlb_get_high_area(len, curareas | areamask);
+			if ((addr != -ENOMEM)
+			    && open_high_hpage_areas(current->mm, areamask) == 0)
+				return addr;
+		}
 	}
+	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+	       " enough areas\n");
+	return -ENOMEM;
 }
 
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
index f20fc52483a..bab255889c5 100644
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
@@ -89,28 +89,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	b	9f
 
 0:	/* user address: proto-VSID = context<<15 | ESID */
-	li	r11,SLB_VSID_USER
-
 	srdi.	r9,r3,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
 #ifdef CONFIG_HUGETLB_PAGE
 BEGIN_FTR_SECTION
-	/* check against the hugepage ranges */
-	cmpldi	r3,(TASK_HPAGE_END>>SID_SHIFT)
-	bge	6f			/* >= TASK_HPAGE_END */
-	cmpldi	r3,(TASK_HPAGE_BASE>>SID_SHIFT)
-	bge	5f			/* TASK_HPAGE_BASE..TASK_HPAGE_END */
+	lhz	r9,PACAHIGHHTLBAREAS(r13)
+	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srd	r9,r9,r11
+	andi.	r9,r9,1
+	bne	5f
+
+	li	r11,SLB_VSID_USER
+
 	cmpldi	r3,16
-	bge	6f			/* 4GB..TASK_HPAGE_BASE */
+	bge	6f
 
-	lhz	r9,PACAHTLBSEGS(r13)
+	lhz	r9,PACALOWHTLBAREAS(r13)
 	srd	r9,r9,r3
 	andi.	r9,r9,1
+
 	beq	6f
 
-5:	/* this is a hugepage user address */
-	li	r11,(SLB_VSID_USER|SLB_VSID_L)
+5:	li	r11,SLB_VSID_USER|SLB_VSID_L
 END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h
index 789c2693483..ad36bb28de2 100644
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
@@ -307,7 +307,7 @@ typedef unsigned long mm_context_id_t;
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
-	u16 htlb_segs; /* bitmask */
+	u16 low_htlb_areas, high_htlb_areas;
 #endif
 } mm_context_t;
 
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index 7e7b18ea986..a79a08df62b 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -37,40 +37,45 @@
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-/* For 64-bit processes the hugepage range is 1T-1.5T */
-#define TASK_HPAGE_BASE ASM_CONST(0x0000010000000000)
-#define TASK_HPAGE_END 	ASM_CONST(0x0000018000000000)
+#define HTLB_AREA_SHIFT		40
+#define HTLB_AREA_SIZE		(1UL << HTLB_AREA_SHIFT)
+#define GET_HTLB_AREA(x)	((x) >> HTLB_AREA_SHIFT)
 
 #define LOW_ESID_MASK(addr, len)	(((1U << (GET_ESID(addr+len-1)+1)) \
 	   	                	- (1U << GET_ESID(addr))) & 0xffff)
+#define HTLB_AREA_MASK(addr, len)	(((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
+	   	                	- (1U << GET_HTLB_AREA(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 #define ARCH_HAS_SETCLEAR_HUGE_PTE
 
 #define touches_hugepage_low_range(mm, addr, len) \
-	(LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
-#define touches_hugepage_high_range(addr, len) \
-	(((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
+	(LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas)
+#define touches_hugepage_high_range(mm, addr, len) \
+	(HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas)
 
 #define __within_hugepage_low_range(addr, len, segmask) \
 	((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask))
 #define within_hugepage_low_range(addr, len) \
 	__within_hugepage_low_range((addr), (len), \
-				    current->mm->context.htlb_segs)
-#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
-	  && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
+				    current->mm->context.low_htlb_areas)
+#define __within_hugepage_high_range(addr, len, zonemask) \
+	((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask))
+#define within_hugepage_high_range(addr, len) \
+	__within_hugepage_high_range((addr), (len), \
+				    current->mm->context.high_htlb_areas)
 
 #define is_hugepage_only_range(mm, addr, len) \
-	(touches_hugepage_high_range((addr), (len)) || \
+	(touches_hugepage_high_range((mm), (addr), (len)) || \
 	  touches_hugepage_low_range((mm), (addr), (len)))
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
 #define in_hugepage_area(context, addr) \
 	(cpu_has_feature(CPU_FTR_16M_PAGE) && \
-	 ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
+	 ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \
 	   ( ((addr) < 0x100000000L) && \
-	     ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) )
+	     ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) )
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
-- 
cgit v1.2.3-70-g09d2


From 8913ca1c9ccb5eb6471afd419159729eef6e2730 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 27 Jul 2005 15:47:23 +1000
Subject: [PATCH] Remove nested feature sections

The {BEGIN,END}_FTR_SECTION asm macros used in ppc64 to nop out
sections of code at runtime cannot be nested.  However, we do nest
them in hash_low.S.  We get away with it there, because there is
nothing between the BEGIN markers for each section.  However, that's
confusing to someone reading the code.

This patch removes the nested ifset and ifclr feature sections,
replacing them with a single feature section in the full mask/value
form.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/mm/hash_low.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
index fbff24827ae..35eb49e1b89 100644
--- a/arch/ppc64/mm/hash_low.S
+++ b/arch/ppc64/mm/hash_low.S
@@ -128,13 +128,11 @@ _GLOBAL(__hash_page)
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...) 
 	 */
-BEGIN_FTR_SECTION
 BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
 	bl	.hash_page_do_lazy_icache
-END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
-END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE)
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
-- 
cgit v1.2.3-70-g09d2


From 319e76a1ae835c34a2838c2bfebe3db4d5a6b387 Mon Sep 17 00:00:00 2001
From: David Gibson <dwg@au1.ibm.com>
Date: Wed, 31 Aug 2005 14:34:05 +1000
Subject: [PATCH] Fix bug in ppc64 dynamic hugepage support

In adjusting the logic for SLB miss for the dynamic hugepage stuff, I
messed up the !CONFIG_HUGETLB_PAGE case, failing to set the SLB flags
properly.

This fixes it.  It also streamlines the logic for the HUGETLB_PAGE case
(removing a couple of branches) while we're at it.

Booted, and roughly tested on POWER5 (with and without HUGETLB_PAGE),
iSeries/RS64 (no hugepage available), and G5 (with and without
HUGETLB_PAGE).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/mm/slb_low.S | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
index bab255889c5..698d6b9ed6d 100644
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
@@ -97,25 +97,21 @@ BEGIN_FTR_SECTION
 	lhz	r9,PACAHIGHHTLBAREAS(r13)
 	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
 	srd	r9,r9,r11
-	andi.	r9,r9,1
-	bne	5f
+	lhz	r11,PACALOWHTLBAREAS(r13)
+	srd	r11,r11,r3
+	or	r9,r9,r11
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
 
 	li	r11,SLB_VSID_USER
 
-	cmpldi	r3,16
-	bge	6f
-
-	lhz	r9,PACALOWHTLBAREAS(r13)
-	srd	r9,r9,r3
-	andi.	r9,r9,1
-
-	beq	6f
-
-5:	li	r11,SLB_VSID_USER|SLB_VSID_L
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+	rldimi	r11,r9,8,55		/* shift masked bit into SLB_VSID_L */
 END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 #endif /* CONFIG_HUGETLB_PAGE */
 
-6:	ld	r9,PACACONTEXTID(r13)
+	ld	r9,PACACONTEXTID(r13)
 	rldimi	r3,r9,USER_ESID_BITS,0
 
 9:	/* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
-- 
cgit v1.2.3-70-g09d2


From 802f192e4a600f7ef84ca25c8b818c8830acef5a Mon Sep 17 00:00:00 2001
From: Bob Picco <bob.picco@hp.com>
Date: Sat, 3 Sep 2005 15:54:26 -0700
Subject: [PATCH] SPARSEMEM EXTREME

A new option for SPARSEMEM is ARCH_SPARSEMEM_EXTREME.  Architecture
platforms with a very sparse physical address space would likely want to
select this option.  For those architecture platforms that don't select the
option, the code generated is equivalent to SPARSEMEM currently in -mm.
I'll be posting a patch on ia64 ml which uses this new SPARSEMEM feature.

ARCH_SPARSEMEM_EXTREME makes mem_section a one dimensional array of
pointers to mem_sections.  This two level layout scheme is able to achieve
smaller memory requirements for SPARSEMEM with the tradeoff of an
additional shift and load when fetching the memory section.  The current
SPARSEMEM -mm implementation is a one dimensional array of mem_sections
which is the default SPARSEMEM configuration.  The patch attempts isolates
the implementation details of the physical layout of the sparsemem section
array.

ARCH_SPARSEMEM_EXTREME depends on 64BIT and is by default boolean false.

I've boot tested under aim load ia64 configured for ARCH_SPARSEMEM_EXTREME.
 I've also boot tested a 4 way Opteron machine with !ARCH_SPARSEMEM_EXTREME
and tested with aim.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/mm/init.c    | 27 +++++++++------------------
 arch/ppc64/mm/numa.c    | 43 ++++++++++++++++++++++++++++++++++++++++---
 include/asm-ppc64/lmb.h | 22 ++++++++++++++++++++++
 include/linux/mmzone.h  | 30 ++++++++++++++++++++++++++++--
 mm/Kconfig              |  9 +++++++++
 mm/sparse.c             | 38 ++++++++++++++++++++++++++++++++------
 6 files changed, 140 insertions(+), 29 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index c02dc9809ca..b3b1e9c1770 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -552,27 +552,18 @@ void __init do_init_bootmem(void)
 	/* Add all physical memory to the bootmem map, mark each area
 	 * present.
 	 */
-	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long base, size;
-		unsigned long start_pfn, end_pfn;
-
-		base = lmb.memory.region[i].base;
-		size = lmb.memory.region[i].size;
-
-		start_pfn = base >> PAGE_SHIFT;
-		end_pfn = start_pfn + (size >> PAGE_SHIFT);
-		memory_present(0, start_pfn, end_pfn);
-
-		free_bootmem(base, size);
-	}
+	for (i=0; i < lmb.memory.cnt; i++)
+		free_bootmem(lmb_start_pfn(&lmb.memory, i),
+			     lmb_size_bytes(&lmb.memory, i));
 
 	/* reserve the sections we're already using */
-	for (i=0; i < lmb.reserved.cnt; i++) {
-		unsigned long base = lmb.reserved.region[i].base;
-		unsigned long size = lmb.reserved.region[i].size;
+	for (i=0; i < lmb.reserved.cnt; i++)
+		reserve_bootmem(lmb_start_pfn(&lmb.reserved, i),
+				lmb_size_bytes(&lmb.reserved, i));
 
-		reserve_bootmem(base, size);
-	}
+	for (i=0; i < lmb.memory.cnt; i++)
+		memory_present(0, lmb_start_pfn(&lmb.memory, i),
+			       lmb_end_pfn(&lmb.memory, i));
 }
 
 /*
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
index c3116f0d788..cb864b8f275 100644
--- a/arch/ppc64/mm/numa.c
+++ b/arch/ppc64/mm/numa.c
@@ -440,8 +440,6 @@ new_range:
 		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
 				numa_domain;
-		memory_present(numa_domain, start >> PAGE_SHIFT,
-						(start + size) >> PAGE_SHIFT);
 
 		if (--ranges)
 			goto new_range;
@@ -483,7 +481,6 @@ static void __init setup_nonnuma(void)
 
 	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
 		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
-	memory_present(0, 0, init_node_data[0].node_end_pfn);
 }
 
 static void __init dump_numa_topology(void)
@@ -695,6 +692,46 @@ new_range:
 						     size);
 			}
 		}
+		/*
+		 * This loop may look famaliar, but we have to do it again
+		 * after marking our reserved memory to mark memory present
+		 * for sparsemem.
+		 */
+		addr_cells = get_mem_addr_cells();
+		size_cells = get_mem_size_cells();
+		memory = NULL;
+		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+			unsigned long mem_start, mem_size;
+			int numa_domain, ranges;
+			unsigned int *memcell_buf;
+			unsigned int len;
+
+			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+			if (!memcell_buf || len <= 0)
+				continue;
+
+			ranges = memory->n_addrs;	/* ranges in cell */
+new_range2:
+			mem_start = read_n_cells(addr_cells, &memcell_buf);
+			mem_size = read_n_cells(size_cells, &memcell_buf);
+			if (numa_enabled) {
+				numa_domain = of_node_numa_domain(memory);
+				if (numa_domain  >= MAX_NUMNODES)
+					numa_domain = 0;
+			} else
+				numa_domain =  0;
+
+			if (numa_domain != nid)
+				continue;
+
+			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+			memory_present(numa_domain, mem_start >> PAGE_SHIFT,
+				       (mem_start + mem_size) >> PAGE_SHIFT);
+
+			if (--ranges)		/* process all ranges in cell */
+				goto new_range2;
+		}
+
 	}
 }
 
diff --git a/include/asm-ppc64/lmb.h b/include/asm-ppc64/lmb.h
index cb368bf0f26..de91e034bd9 100644
--- a/include/asm-ppc64/lmb.h
+++ b/include/asm-ppc64/lmb.h
@@ -56,4 +56,26 @@ extern void lmb_dump_all(void);
 
 extern unsigned long io_hole_start;
 
+static inline unsigned long
+lmb_size_bytes(struct lmb_region *type, unsigned long region_nr)
+{
+	return type->region[region_nr].size;
+}
+static inline unsigned long
+lmb_size_pages(struct lmb_region *type, unsigned long region_nr)
+{
+	return lmb_size_bytes(type, region_nr) >> PAGE_SHIFT;
+}
+static inline unsigned long
+lmb_start_pfn(struct lmb_region *type, unsigned long region_nr)
+{
+	return type->region[region_nr].base >> PAGE_SHIFT;
+}
+static inline unsigned long
+lmb_end_pfn(struct lmb_region *type, unsigned long region_nr)
+{
+	return lmb_start_pfn(type, region_nr) +
+	       lmb_size_pages(type, region_nr);
+}
+
 #endif /* _PPC64_LMB_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6c90461ed99..b97054bbc39 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -487,6 +487,28 @@ struct mem_section {
 	unsigned long section_mem_map;
 };
 
+#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME
+/*
+ * Should we ever require GCC 4 or later then the flat array scheme
+ * can be eliminated and a uniform solution for EXTREME and !EXTREME can
+ * be arrived at.
+ */
+#define SECTION_ROOT_SHIFT	(PAGE_SHIFT-3)
+#define SECTION_ROOT_MASK	((1UL<<SECTION_ROOT_SHIFT) - 1)
+#define SECTION_TO_ROOT(_sec)	((_sec) >> SECTION_ROOT_SHIFT)
+#define NR_SECTION_ROOTS	(NR_MEM_SECTIONS >>  SECTION_ROOT_SHIFT)
+
+extern struct mem_section *mem_section[NR_SECTION_ROOTS];
+
+static inline struct mem_section *__nr_to_section(unsigned long nr)
+{
+	if (!mem_section[SECTION_TO_ROOT(nr)])
+		return NULL;
+	return &mem_section[SECTION_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
+}
+
+#else
+
 extern struct mem_section mem_section[NR_MEM_SECTIONS];
 
 static inline struct mem_section *__nr_to_section(unsigned long nr)
@@ -494,6 +516,10 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 	return &mem_section[nr];
 }
 
+#define sparse_index_init(_sec, _nid)  do {} while (0)
+
+#endif
+
 /*
  * We use the lower bits of the mem_map pointer to store
  * a little bit of information.  There should be at least
@@ -513,12 +539,12 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
 
 static inline int valid_section(struct mem_section *section)
 {
-	return (section->section_mem_map & SECTION_MARKED_PRESENT);
+	return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
 }
 
 static inline int section_has_mem_map(struct mem_section *section)
 {
-	return (section->section_mem_map & SECTION_HAS_MEM_MAP);
+	return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
 }
 
 static inline int valid_section_nr(unsigned long nr)
diff --git a/mm/Kconfig b/mm/Kconfig
index cd379936cac..fc644c5c065 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -89,3 +89,12 @@ config NEED_MULTIPLE_NODES
 config HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
+
+#
+# Architectecture platforms which require a two level mem_section in SPARSEMEM
+# must select this option. This is usually for architecture platforms with
+# an extremely sparse physical address space.
+#
+config ARCH_SPARSEMEM_EXTREME
+	def_bool n
+	depends on SPARSEMEM && 64BIT
diff --git a/mm/sparse.c b/mm/sparse.c
index b54e304df4a..b2b456bf0a5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -13,7 +13,26 @@
  *
  * 1) mem_section	- memory sections, mem_map's for valid memory
  */
-struct mem_section mem_section[NR_MEM_SECTIONS];
+#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME
+struct mem_section *mem_section[NR_SECTION_ROOTS]
+	____cacheline_maxaligned_in_smp;
+
+static void sparse_index_init(unsigned long section, int nid)
+{
+	unsigned long root = SECTION_TO_ROOT(section);
+
+	if (mem_section[root])
+		return;
+	mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE);
+	if (mem_section[root])
+		memset(mem_section[root], 0, PAGE_SIZE);
+	else
+		panic("memory_present: NO MEMORY\n");
+}
+#else
+struct mem_section mem_section[NR_MEM_SECTIONS]
+	____cacheline_maxaligned_in_smp;
+#endif
 EXPORT_SYMBOL(mem_section);
 
 /* Record a memory area against a node. */
@@ -24,8 +43,13 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	start &= PAGE_SECTION_MASK;
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
 		unsigned long section = pfn_to_section_nr(pfn);
-		if (!mem_section[section].section_mem_map)
-			mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
+		struct mem_section *ms;
+
+		sparse_index_init(section, nid);
+
+		ms = __nr_to_section(section);
+		if (!ms->section_mem_map)
+			ms->section_mem_map = SECTION_MARKED_PRESENT;
 	}
 }
 
@@ -85,6 +109,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
 	struct page *map;
 	int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+	struct mem_section *ms = __nr_to_section(pnum);
 
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
@@ -96,7 +121,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 		return map;
 
 	printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
-	mem_section[pnum].section_mem_map = 0;
+	ms->section_mem_map = 0;
 	return NULL;
 }
 
@@ -114,8 +139,9 @@ void sparse_init(void)
 			continue;
 
 		map = sparse_early_mem_map_alloc(pnum);
-		if (map)
-			sparse_init_one_section(&mem_section[pnum], pnum, map);
+		if (!map)
+			continue;
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4721e2214b5fd6eca48caea76afb1bad3148930f Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 6 Sep 2005 12:05:26 +1000
Subject: [PATCH] ppc64: poison initmem

Poison initmem after we free it so we catch use after free issues.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/mm/init.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index b3b1e9c1770..a14ab87df49 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -392,6 +392,7 @@ void free_initmem(void)
 
 	addr = (unsigned long)__init_begin;
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
+		memset((void *)addr, 0xcc, PAGE_SIZE);
 		ClearPageReserved(virt_to_page(addr));
 		set_page_count(virt_to_page(addr), 1);
 		free_page(addr);
-- 
cgit v1.2.3-70-g09d2


From 14b34661615ec036ab4c91637913706e4caccc93 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 6 Sep 2005 14:59:47 +1000
Subject: [PATCH] Invert sense of SLB class bit

Currently, we set the class bit in kernel SLB entries, and clear it on
user SLB entries.  On POWER5, ERAT entries created in real mode have
the class bit clear.  So to avoid flushing kernel ERAT entries on each
context switch, this patch inverts our usage of the class bit, setting
it on user SLB entries and clearing it on kernel SLB entries.

Booted on POWER5 and G5.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc64/kernel/entry.S   | 11 +++++------
 arch/ppc64/mm/hugetlbpage.c |  6 ++++--
 arch/ppc64/mm/slb.c         |  4 ++--
 include/asm-ppc64/mmu.h     |  6 ++++--
 4 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S
index b61572eb2a7..bf99b4a92f2 100644
--- a/arch/ppc64/kernel/entry.S
+++ b/arch/ppc64/kernel/entry.S
@@ -400,15 +400,14 @@ BEGIN_FTR_SECTION
 	cmpd	cr1,r6,r9	/* or is new ESID the same as current ESID? */
 	cror	eq,4*cr1+eq,eq
 	beq	2f		/* if yes, don't slbie it */
-	oris	r0,r6,0x0800	/* set C (class) bit */
 
 	/* Bolt in the new stack SLB entry */
 	ld	r7,KSP_VSID(r4)	/* Get new stack's VSID */
-	oris	r6,r6,(SLB_ESID_V)@h
-	ori	r6,r6,(SLB_NUM_BOLTED-1)@l
-	slbie	r0
-	slbie	r0		/* Workaround POWER5 < DD2.1 issue */
-	slbmte	r7,r6
+	oris	r0,r6,(SLB_ESID_V)@h
+	ori	r0,r0,(SLB_NUM_BOLTED-1)@l
+	slbie	r6
+	slbie	r6		/* Workaround POWER5 < DD2.1 issue */
+	slbmte	r7,r0
 	isync
 
 2:
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index e7833c80eb6..338771ec70d 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -144,7 +144,8 @@ static void flush_low_segments(void *parm)
 	for (i = 0; i < NUM_LOW_AREAS; i++) {
 		if (! (areas & (1U << i)))
 			continue;
-		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
+		asm volatile("slbie %0"
+			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
 	}
 
 	asm volatile("isync" : : : "memory");
@@ -164,7 +165,8 @@ static void flush_high_segments(void *parm)
 			continue;
 		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
 			asm volatile("slbie %0"
-				     :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
+				     :: "r" (((i << HTLB_AREA_SHIFT)
+					     + (j << SID_SHIFT)) | SLBIE_C));
 	}
 
 	asm volatile("isync" : : : "memory");
diff --git a/arch/ppc64/mm/slb.c b/arch/ppc64/mm/slb.c
index 244150a0bc1..0473953f6a3 100644
--- a/arch/ppc64/mm/slb.c
+++ b/arch/ppc64/mm/slb.c
@@ -87,8 +87,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		int i;
 		asm volatile("isync" : : : "memory");
 		for (i = 0; i < offset; i++) {
-			esid_data = (unsigned long)get_paca()->slb_cache[i]
-				<< SID_SHIFT;
+			esid_data = ((unsigned long)get_paca()->slb_cache[i]
+				<< SID_SHIFT) | SLBIE_C;
 			asm volatile("slbie %0" : : "r" (esid_data));
 		}
 		asm volatile("isync" : : : "memory");
diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h
index ad36bb28de2..7bc42eb087a 100644
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
@@ -54,8 +54,10 @@ extern char initial_stab[];
 #define SLB_VSID_C		ASM_CONST(0x0000000000000080) /* class */
 #define SLB_VSID_LS		ASM_CONST(0x0000000000000070) /* size of largepage */
  
-#define SLB_VSID_KERNEL		(SLB_VSID_KP|SLB_VSID_C)
-#define SLB_VSID_USER		(SLB_VSID_KP|SLB_VSID_KS)
+#define SLB_VSID_KERNEL		(SLB_VSID_KP)
+#define SLB_VSID_USER		(SLB_VSID_KP|SLB_VSID_KS|SLB_VSID_C)
+
+#define SLBIE_C			(0x08000000)
 
 /*
  * Hash table
-- 
cgit v1.2.3-70-g09d2


From bb144a85c70a65730424ad1a9dc50fef66e5cafe Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Tue, 6 Sep 2005 15:19:29 -0700
Subject: [PATCH] Kprobes: prevent possible race conditions ppc64 changes

This patch contains the ppc64 architecture specific changes to prevent the
possible race conditions.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/kernel/kprobes.c     | 29 +++++++++++++++--------------
 arch/ppc64/kernel/misc.S        |  4 ++--
 arch/ppc64/kernel/traps.c       |  5 +++--
 arch/ppc64/kernel/vmlinux.lds.S |  1 +
 arch/ppc64/mm/fault.c           |  5 +++--
 include/asm-ppc64/processor.h   | 14 ++++++++++++++
 6 files changed, 38 insertions(+), 20 deletions(-)

(limited to 'arch/ppc64/mm')

diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index a3d519518fb..591e4b67b5a 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -44,7 +44,7 @@ static struct kprobe *kprobe_prev;
 static unsigned long kprobe_status_prev, kprobe_saved_msr_prev;
 static struct pt_regs jprobe_saved_regs;
 
-int arch_prepare_kprobe(struct kprobe *p)
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	kprobe_opcode_t insn = *p->addr;
@@ -68,27 +68,27 @@ int arch_prepare_kprobe(struct kprobe *p)
 	return ret;
 }
 
-void arch_copy_kprobe(struct kprobe *p)
+void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
 	p->opcode = *p->addr;
 }
 
-void arch_arm_kprobe(struct kprobe *p)
+void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
 	*p->addr = BREAKPOINT_INSTRUCTION;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void arch_disarm_kprobe(struct kprobe *p)
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	up(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
@@ -122,7 +122,8 @@ static inline void restore_previous_kprobe(void)
 	kprobe_saved_msr = kprobe_saved_msr_prev;
 }
 
-void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
+				      struct pt_regs *regs)
 {
 	struct kretprobe_instance *ri;
 
@@ -244,7 +245,7 @@ void kretprobe_trampoline_holder(void)
 /*
  * Called when the probe at kretprobe trampoline is hit
  */
-int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
         struct kretprobe_instance *ri = NULL;
         struct hlist_head *head;
@@ -308,7 +309,7 @@ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
  * single-stepped a copy of the instruction.  The address of this
  * copy is p->ainsn.insn.
  */
-static void resume_execution(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
 {
 	int ret;
 	unsigned int insn = *p->ainsn.insn;
@@ -373,8 +374,8 @@ static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 /*
  * Wrapper routine to for handling exceptions.
  */
-int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
-			     void *data)
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
 {
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
@@ -406,7 +407,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 	return ret;
 }
 
-int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 
@@ -419,16 +420,16 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 1;
 }
 
-void jprobe_return(void)
+void __kprobes jprobe_return(void)
 {
 	asm volatile("trap" ::: "memory");
 }
 
-void jprobe_return_end(void)
+void __kprobes jprobe_return_end(void)
 {
 };
 
-int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	/*
 	 * FIXME - we should ideally be validating that we got here 'cos
diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S
index 2164bd7b4ef..6d860c1d9fa 100644
--- a/arch/ppc64/kernel/misc.S
+++ b/arch/ppc64/kernel/misc.S
@@ -183,7 +183,7 @@ PPC64_CACHES:
  *   flush all bytes from start through stop-1 inclusive
  */
 
-_GLOBAL(__flush_icache_range)
+_KPROBE(__flush_icache_range)
 
 /*
  * Flush the data cache to memory 
@@ -223,7 +223,7 @@ _GLOBAL(__flush_icache_range)
 	bdnz	2b
 	isync
 	blr
-	
+	.previous .text
 /*
  * Like above, but only do the D-cache.
  *
diff --git a/arch/ppc64/kernel/traps.c b/arch/ppc64/kernel/traps.c
index a8d5e83ee89..7467ae508e6 100644
--- a/arch/ppc64/kernel/traps.c
+++ b/arch/ppc64/kernel/traps.c
@@ -30,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/kprobes.h>
 #include <asm/kdebug.h>
 
 #include <asm/pgtable.h>
@@ -220,7 +221,7 @@ void instruction_breakpoint_exception(struct pt_regs *regs)
 	_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
 }
 
-void single_step_exception(struct pt_regs *regs)
+void __kprobes single_step_exception(struct pt_regs *regs)
 {
 	regs->msr &= ~MSR_SE;  /* Turn off 'trace' bit */
 
@@ -398,7 +399,7 @@ check_bug_trap(struct pt_regs *regs)
 	return 0;
 }
 
-void program_check_exception(struct pt_regs *regs)
+void __kprobes program_check_exception(struct pt_regs *regs)
 {
 	if (debugger_fault_handler(regs))
 		return;
diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S
index 4103cc13f8d..0306510bc4f 100644
--- a/arch/ppc64/kernel/vmlinux.lds.S
+++ b/arch/ppc64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
 	*(.text .text.*)
 	SCHED_TEXT
 	LOCK_TEXT
+	KPROBES_TEXT
 	*(.fixup)
 	. = ALIGN(4096);
 	_etext = .;
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c
index 20b0f37e8bf..772f0714a5b 100644
--- a/arch/ppc64/mm/fault.c
+++ b/arch/ppc64/mm/fault.c
@@ -29,6 +29,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp_lock.h>
 #include <linux/module.h>
+#include <linux/kprobes.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -84,8 +85,8 @@ static int store_updates_sp(struct pt_regs *regs)
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-int do_page_fault(struct pt_regs *regs, unsigned long address,
-		  unsigned long error_code)
+int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+			    unsigned long error_code)
 {
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h
index 7bd4796f123..8bd7aa95938 100644
--- a/include/asm-ppc64/processor.h
+++ b/include/asm-ppc64/processor.h
@@ -311,6 +311,20 @@ name: \
 	.type GLUE(.,name),@function; \
 GLUE(.,name):
 
+#define _KPROBE(name) \
+	.section ".kprobes.text","a"; \
+	.align 2 ; \
+	.globl name; \
+	.globl GLUE(.,name); \
+	.section ".opd","aw"; \
+name: \
+	.quad GLUE(.,name); \
+	.quad .TOC.@tocbase; \
+	.quad 0; \
+	.previous; \
+	.type GLUE(.,name),@function; \
+GLUE(.,name):
+
 #define _STATIC(name) \
 	.section ".text"; \
 	.align 2 ; \
-- 
cgit v1.2.3-70-g09d2