summaryrefslogtreecommitdiffstats
path: root/arch/ppc64/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ppc64/mm
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/ppc64/mm')
-rw-r--r--arch/ppc64/mm/Makefile11
-rw-r--r--arch/ppc64/mm/fault.c312
-rw-r--r--arch/ppc64/mm/hash_low.S287
-rw-r--r--arch/ppc64/mm/hash_native.c423
-rw-r--r--arch/ppc64/mm/hash_utils.c439
-rw-r--r--arch/ppc64/mm/hugetlbpage.c904
-rw-r--r--arch/ppc64/mm/imalloc.c312
-rw-r--r--arch/ppc64/mm/init.c927
-rw-r--r--arch/ppc64/mm/mmap.c86
-rw-r--r--arch/ppc64/mm/numa.c734
-rw-r--r--arch/ppc64/mm/slb.c159
-rw-r--r--arch/ppc64/mm/slb_low.S154
-rw-r--r--arch/ppc64/mm/stab.c239
-rw-r--r--arch/ppc64/mm/tlb.c180
14 files changed, 5167 insertions, 0 deletions
diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile
new file mode 100644
index 00000000000..ac522d57b2a
--- /dev/null
+++ b/arch/ppc64/mm/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux ppc-specific parts of the memory manager.
+#
+
+EXTRA_CFLAGS += -mno-minimal-toc
+
+obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \
+ slb_low.o slb.o stab.o mmap.o
+obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c
new file mode 100644
index 00000000000..20b0f37e8bf
--- /dev/null
+++ b/arch/ppc64/mm/fault.c
@@ -0,0 +1,312 @@
+/*
+ * arch/ppc/mm/fault.c
+ *
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Derived from "arch/i386/mm/fault.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Modified by Cort Dougan and Paul Mackerras.
+ *
+ * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/kdebug.h>
+
+/*
+ * Check whether the instruction at regs->nip is a store using
+ * an update addressing form which will update r1.
+ */
+static int store_updates_sp(struct pt_regs *regs)
+{
+ unsigned int inst;
+
+ if (get_user(inst, (unsigned int __user *)regs->nip))
+ return 0;
+ /* check for 1 in the rA field */
+ if (((inst >> 16) & 0x1f) != 1)
+ return 0;
+ /* check major opcode */
+ switch (inst >> 26) {
+ case 37: /* stwu */
+ case 39: /* stbu */
+ case 45: /* sthu */
+ case 53: /* stfsu */
+ case 55: /* stfdu */
+ return 1;
+ case 62: /* std or stdu */
+ return (inst & 3) == 1;
+ case 31:
+ /* check minor opcode */
+ switch ((inst >> 1) & 0x3ff) {
+ case 181: /* stdux */
+ case 183: /* stwux */
+ case 247: /* stbux */
+ case 439: /* sthux */
+ case 695: /* stfsux */
+ case 759: /* stfdux */
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * The error_code parameter is
+ * - DSISR for a non-SLB data access fault,
+ * - SRR1 & 0x08000000 for a non-SLB instruction access fault
+ * - 0 any SLB fault.
+ * The return value is 0 if the fault was handled, or the signal
+ * number if this is a kernel fault that can't be handled here.
+ */
+int do_page_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+ siginfo_t info;
+ unsigned long code = SEGV_MAPERR;
+ unsigned long is_write = error_code & DSISR_ISSTORE;
+ unsigned long trap = TRAP(regs);
+ unsigned long is_exec = trap == 0x400;
+
+ BUG_ON((trap == 0x380) || (trap == 0x480));
+
+ if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
+ 11, SIGSEGV) == NOTIFY_STOP)
+ return 0;
+
+ if (trap == 0x300) {
+ if (debugger_fault_handler(regs))
+ return 0;
+ }
+
+ /* On a kernel SLB miss we can only check for a valid exception entry */
+ if (!user_mode(regs) && (address >= TASK_SIZE))
+ return SIGSEGV;
+
+ if (error_code & DSISR_DABRMATCH) {
+ if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+ 11, SIGSEGV) == NOTIFY_STOP)
+ return 0;
+ if (debugger_dabr_match(regs))
+ return 0;
+ }
+
+ if (in_atomic() || mm == NULL) {
+ if (!user_mode(regs))
+ return SIGSEGV;
+ /* in_atomic() in user mode is really bad,
+ as is current->mm == NULL. */
+ printk(KERN_EMERG "Page fault in user mode with"
+ "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+ printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
+ regs->nip, regs->msr);
+ die("Weird page fault", regs, SIGSEGV);
+ }
+
+ /* When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in the
+ * kernel and should generate an OOPS. Unfortunatly, in the case of an
+ * erroneous fault occuring in a code path which already holds mmap_sem
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+ * exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibilty of a deadlock.
+ * Attempt to lock the address space, if we cannot we then validate the
+ * source. If this is invalid we can skip the address space check,
+ * thus avoiding the deadlock.
+ */
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!user_mode(regs) && !search_exception_tables(regs->nip))
+ goto bad_area_nosemaphore;
+
+ down_read(&mm->mmap_sem);
+ }
+
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto bad_area;
+
+ if (vma->vm_start <= address) {
+ goto good_area;
+ }
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto bad_area;
+
+ /*
+ * N.B. The POWER/Open ABI allows programs to access up to
+ * 288 bytes below the stack pointer.
+ * The kernel signal delivery code writes up to about 1.5kB
+ * below the stack pointer (r1) before decrementing it.
+ * The exec code can write slightly over 640kB to the stack
+ * before setting the user r1. Thus we allow the stack to
+ * expand to 1MB without further checks.
+ */
+ if (address + 0x100000 < vma->vm_end) {
+ /* get user regs even if this fault is in kernel mode */
+ struct pt_regs *uregs = current->thread.regs;
+ if (uregs == NULL)
+ goto bad_area;
+
+ /*
+ * A user-mode access to an address a long way below
+ * the stack pointer is only valid if the instruction
+ * is one which would update the stack pointer to the
+ * address accessed if the instruction completed,
+ * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+ * (or the byte, halfword, float or double forms).
+ *
+ * If we don't check this then any write to the area
+ * between the last mapped region and the stack will
+ * expand the stack rather than segfaulting.
+ */
+ if (address + 2048 < uregs->gpr[1]
+ && (!user_mode(regs) || !store_updates_sp(regs)))
+ goto bad_area;
+ }
+
+ if (expand_stack(vma, address))
+ goto bad_area;
+
+good_area:
+ code = SEGV_ACCERR;
+
+ if (is_exec) {
+ /* protection fault */
+ if (error_code & DSISR_PROTFAULT)
+ goto bad_area;
+ if (!(vma->vm_flags & VM_EXEC))
+ goto bad_area;
+ /* a write */
+ } else if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
+ goto bad_area;
+ /* a read */
+ } else {
+ if (!(vma->vm_flags & VM_READ))
+ goto bad_area;
+ }
+
+ survive:
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+ switch (handle_mm_fault(mm, vma, address, is_write)) {
+
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ case VM_FAULT_SIGBUS:
+ goto do_sigbus;
+ case VM_FAULT_OOM:
+ goto out_of_memory;
+ default:
+ BUG();
+ }
+
+ up_read(&mm->mmap_sem);
+ return 0;
+
+bad_area:
+ up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+ /* User mode accesses cause a SIGSEGV */
+ if (user_mode(regs)) {
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = (void __user *) address;
+ force_sig_info(SIGSEGV, &info, current);
+ return 0;
+ }
+
+ if (trap == 0x400 && (error_code & DSISR_PROTFAULT)
+ && printk_ratelimit())
+ printk(KERN_CRIT "kernel tried to execute NX-protected"
+ " page (%lx) - exploit attempt? (uid: %d)\n",
+ address, current->uid);
+
+ return SIGSEGV;
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (current->pid == 1) {
+ yield();
+ down_read(&mm->mmap_sem);
+ goto survive;
+ }
+ printk("VM: killing process %s\n", current->comm);
+ if (user_mode(regs))
+ do_exit(SIGKILL);
+ return SIGKILL;
+
+do_sigbus:
+ up_read(&mm->mmap_sem);
+ if (user_mode(regs)) {
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+ force_sig_info(SIGBUS, &info, current);
+ return 0;
+ }
+ return SIGBUS;
+}
+
+/*
+ * bad_page_fault is called when we have a bad access from the kernel.
+ * It is called from do_page_fault above and from some of the procedures
+ * in traps.c.
+ */
+void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+{
+ const struct exception_table_entry *entry;
+
+ /* Are we prepared to handle this fault? */
+ if ((entry = search_exception_tables(regs->nip)) != NULL) {
+ regs->nip = entry->fixup;
+ return;
+ }
+
+ /* kernel has accessed a bad area */
+ die("Kernel access of bad area", regs, sig);
+}
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
new file mode 100644
index 00000000000..8c0156a3700
--- /dev/null
+++ b/arch/ppc64/mm/hash_low.S
@@ -0,0 +1,287 @@
+/*
+ * ppc64 MMU hashtable management routines
+ *
+ * (c) Copyright IBM Corp. 2003
+ *
+ * Maintained by: Benjamin Herrenschmidt
+ * <benh@kernel.crashing.org>
+ *
+ * This file is covered by the GNU Public Licence v2 as
+ * described in the kernel's COPYING file.
+ */
+
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/cputable.h>
+
+ .text
+
+/*
+ * Stackframe:
+ *
+ * +-> Back chain (SP + 256)
+ * | General register save area (SP + 112)
+ * | Parameter save area (SP + 48)
+ * | TOC save area (SP + 40)
+ * | link editor doubleword (SP + 32)
+ * | compiler doubleword (SP + 24)
+ * | LR save area (SP + 16)
+ * | CR save area (SP + 8)
+ * SP ---> +-- Back chain (SP + 0)
+ */
+#define STACKFRAMESIZE 256
+
+/* Save parameters offsets */
+#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8)
+
+/* Save non-volatile offsets */
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+/*
+ * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
+ * pte_t *ptep, unsigned long trap, int local)
+ *
+ * Adds a page to the hash table. This is the non-LPAR version for now
+ */
+
+_GLOBAL(__hash_page)
+ mflr r0
+ std r0,16(r1)
+ stdu r1,-STACKFRAMESIZE(r1)
+ /* Save all params that we need after a function call */
+ std r6,STK_PARM(r6)(r1)
+ std r8,STK_PARM(r8)(r1)
+
+ /* Add _PAGE_PRESENT to access */
+ ori r4,r4,_PAGE_PRESENT
+
+ /* Save non-volatile registers.
+ * r31 will hold "old PTE"
+ * r30 is "new PTE"
+ * r29 is "va"
+ * r28 is a hash value
+ * r27 is hashtab mask (maybe dynamic patched instead ?)
+ */
+ std r27,STK_REG(r27)(r1)
+ std r28,STK_REG(r28)(r1)
+ std r29,STK_REG(r29)(r1)
+ std r30,STK_REG(r30)(r1)
+ std r31,STK_REG(r31)(r1)
+
+ /* Step 1:
+ *
+ * Check permissions, atomically mark the linux PTE busy
+ * and hashed.
+ */
+1:
+ ldarx r31,0,r6
+ /* Check access rights (access & ~(pte_val(*ptep))) */
+ andc. r0,r4,r31
+ bne- htab_wrong_access
+ /* Check if PTE is busy */
+ andi. r0,r31,_PAGE_BUSY
+ bne- 1b
+ /* Prepare new PTE value (turn access RW into DIRTY, then
+ * add BUSY,HASHPTE and ACCESSED)
+ */
+ rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
+ or r30,r30,r31
+ ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+ /* Write the linux PTE atomically (setting busy) */
+ stdcx. r30,0,r6
+ bne- 1b
+ isync
+
+ /* Step 2:
+ *
+ * Insert/Update the HPTE in the hash table. At this point,
+ * r4 (access) is re-useable, we use it for the new HPTE flags
+ */
+
+ /* Calc va and put it in r29 */
+ rldicr r29,r5,28,63-28
+ rldicl r3,r3,0,36
+ or r29,r3,r29
+
+ /* Calculate hash value for primary slot and store it in r28 */
+ rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
+ rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
+ xor r28,r5,r0
+
+ /* Convert linux PTE bits into HW equivalents */
+ andi. r3,r30,0x1fe /* Get basic set of flags */
+ xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */
+ rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
+ rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */
+ and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+ andc r0,r30,r0 /* r0 = pte & ~r0 */
+ rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
+
+ /* We eventually do the icache sync here (maybe inline that
+ * code rather than call a C function...)
+ */
+BEGIN_FTR_SECTION
+BEGIN_FTR_SECTION
+ mr r4,r30
+ mr r5,r7
+ bl .hash_page_do_lazy_icache
+END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
+END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE)
+
+ /* At this point, r3 contains new PP bits, save them in
+ * place of "access" in the param area (sic)
+ */
+ std r3,STK_PARM(r4)(r1)
+
+ /* Get htab_hash_mask */
+ ld r4,htab_hash_mask@got(2)
+ ld r27,0(r4) /* htab_hash_mask -> r27 */
+
+ /* Check if we may already be in the hashtable, in this case, we
+ * go to out-of-line code to try to modify the HPTE
+ */
+ andi. r0,r31,_PAGE_HASHPTE
+ bne htab_modify_pte
+
+htab_insert_pte:
+ /* Clear hpte bits in new pte (we also clear BUSY btw) and
+ * add _PAGE_HASHPTE
+ */
+ lis r0,_PAGE_HPTEFLAGS@h
+ ori r0,r0,_PAGE_HPTEFLAGS@l
+ andc r30,r30,r0
+ ori r30,r30,_PAGE_HASHPTE
+
+ /* page number in r5 */
+ rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+ /* Calculate primary group hash */
+ and r0,r28,r27
+ rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
+
+ /* Call ppc_md.hpte_insert */
+ ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */
+ mr r4,r29 /* Retreive va */
+ li r6,0 /* primary slot */
+ li r8,0 /* not bolted and not large */
+ li r9,0
+_GLOBAL(htab_call_hpte_insert1)
+ bl . /* Will be patched by htab_finish_init() */
+ cmpdi 0,r3,0
+ bge htab_pte_insert_ok /* Insertion successful */
+ cmpdi 0,r3,-2 /* Critical failure */
+ beq- htab_pte_insert_failure
+
+ /* Now try secondary slot */
+
+ /* page number in r5 */
+ rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+ /* Calculate secondary group hash */
+ andc r0,r27,r28
+ rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
+
+ /* Call ppc_md.hpte_insert */
+ ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */
+ mr r4,r29 /* Retreive va */
+ li r6,1 /* secondary slot */
+ li r8,0 /* not bolted and not large */
+ li r9,0
+_GLOBAL(htab_call_hpte_insert2)
+ bl . /* Will be patched by htab_finish_init() */
+ cmpdi 0,r3,0
+ bge+ htab_pte_insert_ok /* Insertion successful */
+ cmpdi 0,r3,-2 /* Critical failure */
+ beq- htab_pte_insert_failure
+
+ /* Both are full, we need to evict something */
+ mftb r0
+ /* Pick a random group based on TB */
+ andi. r0,r0,1
+ mr r5,r28
+ bne 2f
+ not r5,r5
+2: and r0,r5,r27
+ rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
+ /* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+ bl . /* Will be patched by htab_finish_init() */
+
+ /* Try all again */
+ b htab_insert_pte
+
+htab_pte_insert_ok:
+ /* Insert slot number & secondary bit in PTE */
+ rldimi r30,r3,12,63-15
+
+ /* Write out the PTE with a normal write
+ * (maybe add eieio may be good still ?)
+ */
+htab_write_out_pte:
+ ld r6,STK_PARM(r6)(r1)
+ std r30,0(r6)
+ li r3, 0
+bail:
+ ld r27,STK_REG(r27)(r1)
+ ld r28,STK_REG(r28)(r1)
+ ld r29,STK_REG(r29)(r1)
+ ld r30,STK_REG(r30)(r1)
+ ld r31,STK_REG(r31)(r1)
+ addi r1,r1,STACKFRAMESIZE
+ ld r0,16(r1)
+ mtlr r0
+ blr
+
+htab_modify_pte:
+ /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+ mr r4,r3
+ rlwinm r3,r31,32-12,29,31
+
+ /* Secondary group ? if yes, get a inverted hash value */
+ mr r5,r28
+ andi. r0,r31,_PAGE_SECONDARY
+ beq 1f
+ not r5,r5
+1:
+ /* Calculate proper slot value for ppc_md.hpte_updatepp */
+ and r0,r5,r27
+ rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
+ add r3,r0,r3 /* add slot idx */
+
+ /* Call ppc_md.hpte_updatepp */
+ mr r5,r29 /* va */
+ li r6,0 /* large is 0 */
+ ld r7,STK_PARM(r8)(r1) /* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+ bl . /* Will be patched by htab_finish_init() */
+
+ /* if we failed because typically the HPTE wasn't really here
+ * we try an insertion.
+ */
+ cmpdi 0,r3,-1
+ beq- htab_insert_pte
+
+ /* Clear the BUSY bit and Write out the PTE */
+ li r0,_PAGE_BUSY
+ andc r30,r30,r0
+ b htab_write_out_pte
+
+htab_wrong_access:
+ /* Bail out clearing reservation */
+ stdcx. r31,0,r6
+ li r3,1
+ b bail
+
+htab_pte_insert_failure:
+ /* Bail out restoring old PTE */
+ ld r6,STK_PARM(r6)(r1)
+ std r31,0(r6)
+ li r3,-1
+ b bail
+
+
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
new file mode 100644
index 00000000000..144657e0c3d
--- /dev/null
+++ b/arch/ppc64/mm/hash_native.c
@@ -0,0 +1,423 @@
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+
+#include <asm/abs_addr.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/cputable.h>
+
+#define HPTE_LOCK_BIT 3
+
+static DEFINE_SPINLOCK(native_tlbie_lock);
+
+static inline void native_lock_hpte(HPTE *hptep)
+{
+ unsigned long *word = &hptep->dw0.dword0;
+
+ while (1) {
+ if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+ break;
+ while(test_bit(HPTE_LOCK_BIT, word))
+ cpu_relax();
+ }
+}
+
+static inline void native_unlock_hpte(HPTE *hptep)
+{
+ unsigned long *word = &hptep->dw0.dword0;
+
+ asm volatile("lwsync":::"memory");
+ clear_bit(HPTE_LOCK_BIT, word);
+}
+
+long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+ unsigned long prpn, int secondary,
+ unsigned long hpteflags, int bolted, int large)
+{
+ unsigned long arpn = physRpn_to_absRpn(prpn);
+ HPTE *hptep = htab_address + hpte_group;
+ Hpte_dword0 dw0;
+ HPTE lhpte;
+ int i;
+
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ dw0 = hptep->dw0.dw0;
+
+ if (!dw0.v) {
+ /* retry with lock held */
+ native_lock_hpte(hptep);
+ dw0 = hptep->dw0.dw0;
+ if (!dw0.v)
+ break;
+ native_unlock_hpte(hptep);
+ }
+
+ hptep++;
+ }
+
+ if (i == HPTES_PER_GROUP)
+ return -1;
+
+ lhpte.dw1.dword1 = 0;
+ lhpte.dw1.dw1.rpn = arpn;
+ lhpte.dw1.flags.flags = hpteflags;
+
+ lhpte.dw0.dword0 = 0;
+ lhpte.dw0.dw0.avpn = va >> 23;
+ lhpte.dw0.dw0.h = secondary;
+ lhpte.dw0.dw0.bolted = bolted;
+ lhpte.dw0.dw0.v = 1;
+
+ if (large) {
+ lhpte.dw0.dw0.l = 1;
+ lhpte.dw0.dw0.avpn &= ~0x1UL;
+ }
+
+ hptep->dw1.dword1 = lhpte.dw1.dword1;
+
+ /* Guarantee the second dword is visible before the valid bit */
+ __asm__ __volatile__ ("eieio" : : : "memory");
+
+ /*
+ * Now set the first dword including the valid bit
+ * NOTE: this also unlocks the hpte
+ */
+ hptep->dw0.dword0 = lhpte.dw0.dword0;
+
+ __asm__ __volatile__ ("ptesync" : : : "memory");
+
+ return i | (secondary << 3);
+}
+
+static long native_hpte_remove(unsigned long hpte_group)
+{
+ HPTE *hptep;
+ Hpte_dword0 dw0;
+ int i;
+ int slot_offset;
+
+ /* pick a random entry to start at */
+ slot_offset = mftb() & 0x7;
+
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hptep = htab_address + hpte_group + slot_offset;
+ dw0 = hptep->dw0.dw0;
+
+ if (dw0.v && !dw0.bolted) {
+ /* retry with lock held */
+ native_lock_hpte(hptep);
+ dw0 = hptep->dw0.dw0;
+ if (dw0.v && !dw0.bolted)
+ break;
+ native_unlock_hpte(hptep);
+ }
+
+ slot_offset++;
+ slot_offset &= 0x7;
+ }
+
+ if (i == HPTES_PER_GROUP)
+ return -1;
+
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->dw0.dword0 = 0;
+
+ return i;
+}
+
+static inline void set_pp_bit(unsigned long pp, HPTE *addr)
+{
+ unsigned long old;
+ unsigned long *p = &addr->dw1.dword1;
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ rldimi %0,%2,0,61\n\
+ stdcx. %0,0,%3\n\
+ bne 1b"
+ : "=&r" (old), "=m" (*p)
+ : "r" (pp), "r" (p), "m" (*p)
+ : "cc");
+}
+
+/*
+ * Only works on small pages. Yes its ugly to have to check each slot in
+ * the group but we only use this during bootup.
+ */
+static long native_hpte_find(unsigned long vpn)
+{
+ HPTE *hptep;
+ unsigned long hash;
+ unsigned long i, j;
+ long slot;
+ Hpte_dword0 dw0;
+
+ hash = hpt_hash(vpn, 0);
+
+ for (j = 0; j < 2; j++) {
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hptep = htab_address + slot;
+ dw0 = hptep->dw0.dw0;
+
+ if ((dw0.avpn == (vpn >> 11)) && dw0.v &&
+ (dw0.h == j)) {
+ /* HPTE matches */
+ if (j)
+ slot = -slot;
+ return slot;
+ }
+ ++slot;
+ }
+ hash = ~hash;
+ }
+
+ return -1;
+}
+
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+ unsigned long va, int large, int local)
+{
+ HPTE *hptep = htab_address + slot;
+ Hpte_dword0 dw0;
+ unsigned long avpn = va >> 23;
+ int ret = 0;
+
+ if (large)
+ avpn &= ~0x1UL;
+
+ native_lock_hpte(hptep);
+
+ dw0 = hptep->dw0.dw0;
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if ((dw0.avpn != avpn) || !dw0.v) {
+ native_unlock_hpte(hptep);
+ ret = -1;
+ } else {
+ set_pp_bit(newpp, hptep);
+ native_unlock_hpte(hptep);
+ }
+
+ /* Ensure it is out of the tlb too */
+ if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+ tlbiel(va);
+ } else {
+ int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ spin_lock(&native_tlbie_lock);
+ tlbie(va, large);
+ if (lock_tlbie)
+ spin_unlock(&native_tlbie_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ * Does not work on large pages.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+{
+ unsigned long vsid, va, vpn, flags = 0;
+ long slot;
+ HPTE *hptep;
+ int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+ vsid = get_kernel_vsid(ea);
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ vpn = va >> PAGE_SHIFT;
+
+ slot = native_hpte_find(vpn);
+ if (slot == -1)
+ panic("could not find page to bolt\n");
+ hptep = htab_address + slot;
+
+ set_pp_bit(newpp, hptep);
+
+ /* Ensure it is out of the tlb too */
+ if (lock_tlbie)
+ spin_lock_irqsave(&native_tlbie_lock, flags);
+ tlbie(va, 0);
+ if (lock_tlbie)
+ spin_unlock_irqrestore(&native_tlbie_lock, flags);
+}
+
+static void native_hpte_invalidate(unsigned long slot, unsigned long va,
+ int large, int local)
+{
+ HPTE *hptep = htab_address + slot;
+ Hpte_dword0 dw0;
+ unsigned long avpn = va >> 23;
+ unsigned long flags;
+ int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+ if (large)
+ avpn &= ~0x1UL;
+
+ local_irq_save(flags);
+ native_lock_hpte(hptep);
+
+ dw0 = hptep->dw0.dw0;
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if ((dw0.avpn != avpn) || !dw0.v) {
+ native_unlock_hpte(hptep);
+ } else {
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->dw0.dword0 = 0;
+ }
+
+ /* Invalidate the tlb */
+ if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+ tlbiel(va);
+ } else {
+ if (lock_tlbie)
+ spin_lock(&native_tlbie_lock);
+ tlbie(va, large);
+ if (lock_tlbie)
+ spin_unlock(&native_tlbie_lock);
+ }
+ local_irq_restore(flags);
+}
+
+static void native_flush_hash_range(unsigned long context,
+ unsigned long number, int local)
+{
+ unsigned long vsid, vpn, va, hash, secondary, slot, flags, avpn;
+ int i, j;
+ HPTE *hptep;
+ Hpte_dword0 dw0;
+ struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+
+ /* XXX fix for large ptes */
+ unsigned long large = 0;
+
+ local_irq_save(flags);
+
+ j = 0;
+ for (i = 0; i < number; i++) {
+ if ((batch->addr[i] >= USER_START) &&
+ (batch->addr[i] <= USER_END))
+ vsid = get_vsid(context, batch->addr[i]);
+ else
+ vsid = get_kernel_vsid(batch->addr[i]);
+
+ va = (vsid << 28) | (batch->addr[i] & 0x0fffffff);
+ batch->vaddr[j] = va;
+ if (large)
+ vpn = va >> HPAGE_SHIFT;
+ else
+ vpn = va >> PAGE_SHIFT;
+ hash = hpt_hash(vpn, large);
+ secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
+ if (secondary)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
+
+ hptep = htab_address + slot;
+
+ avpn = va >> 23;
+ if (large)
+ avpn &= ~0x1UL;
+
+ native_lock_hpte(hptep);
+
+ dw0 = hptep->dw0.dw0;
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if ((dw0.avpn != avpn) || !dw0.v) {
+ native_unlock_hpte(hptep);
+ } else {
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->dw0.dword0 = 0;
+ }
+
+ j++;
+ }
+
+ if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+ asm volatile("ptesync":::"memory");
+
+ for (i = 0; i < j; i++)
+ __tlbiel(batch->vaddr[i]);
+
+ asm volatile("ptesync":::"memory");
+ } else {
+ int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ spin_lock(&native_tlbie_lock);
+
+ asm volatile("ptesync":::"memory");
+
+ for (i = 0; i < j; i++)
+ __tlbie(batch->vaddr[i], 0);
+
+ asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+ if (lock_tlbie)
+ spin_unlock(&native_tlbie_lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PPC_PSERIES
+/* Disable TLB batching on nighthawk */
+static inline int tlb_batching_enabled(void)
+{
+ struct device_node *root = of_find_node_by_path("/");
+ int enabled = 1;
+
+ if (root) {
+ const char *model = get_property(root, "model", NULL);
+ if (model && !strcmp(model, "IBM,9076-N81"))
+ enabled = 0;
+ of_node_put(root);
+ }
+
+ return enabled;
+}
+#else
+static inline int tlb_batching_enabled(void)
+{
+ return 1;
+}
+#endif
+
+void hpte_init_native(void)
+{
+ ppc_md.hpte_invalidate = native_hpte_invalidate;
+ ppc_md.hpte_updatepp = native_hpte_updatepp;
+ ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
+ ppc_md.hpte_insert = native_hpte_insert;
+ ppc_md.hpte_remove = native_hpte_remove;
+ if (tlb_batching_enabled())
+ ppc_md.flush_hash_range = native_flush_hash_range;
+ htab_finish_init();
+}
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
new file mode 100644
index 00000000000..e48be12f518
--- /dev/null
+++ b/arch/ppc64/mm/hash_utils.c
@@ -0,0 +1,439 @@
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ * {mikejc|engebret}@us.ibm.com
+ *
+ * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * Module name: htab.c
+ *
+ * Description:
+ * PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+
+#include <asm/ppcdebug.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/abs_addr.h>
+#include <asm/sections.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * Note: pte --> Linux PTE
+ * HPTE --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ * htab_initialize is called with the MMU off (of course), but
+ * the kernel has been copied down to zero so it can directly
+ * reference global data. At this point it is very difficult
+ * to print debug info.
+ *
+ */
+
+#ifdef CONFIG_U3_DART
+extern unsigned long dart_tablebase;
+#endif /* CONFIG_U3_DART */
+
+HPTE *htab_address;
+unsigned long htab_hash_mask;
+
+extern unsigned long _SDR1;
+
+#define KB (1024)
+#define MB (1024*KB)
+
+static inline void loop_forever(void)
+{
+ volatile unsigned long x = 1;
+ for(;x;x|=1)
+ ;
+}
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+static inline void create_pte_mapping(unsigned long start, unsigned long end,
+ unsigned long mode, int large)
+{
+ unsigned long addr;
+ unsigned int step;
+ unsigned long tmp_mode;
+
+ if (large)
+ step = 16*MB;
+ else
+ step = 4*KB;
+
+ for (addr = start; addr < end; addr += step) {
+ unsigned long vpn, hash, hpteg;
+ unsigned long vsid = get_kernel_vsid(addr);
+ unsigned long va = (vsid << 28) | (addr & 0xfffffff);
+ int ret;
+
+ if (large)
+ vpn = va >> HPAGE_SHIFT;
+ else
+ vpn = va >> PAGE_SHIFT;
+
+
+ tmp_mode = mode;
+
+ /* Make non-kernel text non-executable */
+ if (!in_kernel_text(addr))
+ tmp_mode = mode | HW_NO_EXEC;
+
+ hash = hpt_hash(vpn, large);
+
+ hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+#ifdef CONFIG_PPC_PSERIES
+ if (systemcfg->platform & PLATFORM_LPAR)
+ ret = pSeries_lpar_hpte_insert(hpteg, va,
+ virt_to_abs(addr) >> PAGE_SHIFT,
+ 0, tmp_mode, 1, large);
+ else
+#endif /* CONFIG_PPC_PSERIES */
+ ret = native_hpte_insert(hpteg, va,
+ virt_to_abs(addr) >> PAGE_SHIFT,
+ 0, tmp_mode, 1, large);
+
+ if (ret == -1) {
+ ppc64_terminate_msg(0x20, "create_pte_mapping");
+ loop_forever();
+ }
+ }
+}
+
+void __init htab_initialize(void)
+{
+ unsigned long table, htab_size_bytes;
+ unsigned long pteg_count;
+ unsigned long mode_rw;
+ int i, use_largepages = 0;
+ unsigned long base = 0, size = 0;
+ extern unsigned long tce_alloc_start, tce_alloc_end;
+
+ DBG(" -> htab_initialize()\n");
+
+ /*
+ * Calculate the required size of the htab. We want the number of
+ * PTEGs to equal one half the number of real pages.
+ */
+ htab_size_bytes = 1UL << ppc64_pft_size;
+ pteg_count = htab_size_bytes >> 7;
+
+ /* For debug, make the HTAB 1/8 as big as it normally would be. */
+ ifppcdebug(PPCDBG_HTABSIZE) {
+ pteg_count >>= 3;
+ htab_size_bytes = pteg_count << 7;
+ }
+
+ htab_hash_mask = pteg_count - 1;
+
+ if (systemcfg->platform & PLATFORM_LPAR) {
+ /* Using a hypervisor which owns the htab */
+ htab_address = NULL;
+ _SDR1 = 0;
+ } else {
+ /* Find storage for the HPT. Must be contiguous in
+ * the absolute address space.
+ */
+ table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+
+ DBG("Hash table allocated at %lx, size: %lx\n", table,
+ htab_size_bytes);
+
+ if ( !table ) {
+ ppc64_terminate_msg(0x20, "hpt space");
+ loop_forever();
+ }
+ htab_address = abs_to_virt(table);
+
+ /* htab absolute addr + encoded htabsize */
+ _SDR1 = table + __ilog2(pteg_count) - 11;
+
+ /* Initialize the HPT with no entries */
+ memset((void *)table, 0, htab_size_bytes);
+ }
+
+ mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;
+
+ /* On U3 based machines, we need to reserve the DART area and
+ * _NOT_ map it to avoid cache paradoxes as it's remapped non
+ * cacheable later on
+ */
+ if (cpu_has_feature(CPU_FTR_16M_PAGE))
+ use_largepages = 1;
+
+ /* create bolted the linear mapping in the hash table */
+ for (i=0; i < lmb.memory.cnt; i++) {
+ base = lmb.memory.region[i].physbase + KERNELBASE;
+ size = lmb.memory.region[i].size;
+
+ DBG("creating mapping for region: %lx : %lx\n", base, size);
+
+#ifdef CONFIG_U3_DART
+ /* Do not map the DART space. Fortunately, it will be aligned
+ * in such a way that it will not cross two lmb regions and will
+ * fit within a single 16Mb page.
+ * The DART space is assumed to be a full 16Mb region even if we
+ * only use 2Mb of that space. We will use more of it later for
+ * AGP GART. We have to use a full 16Mb large page.
+ */
+ DBG("DART base: %lx\n", dart_tablebase);
+
+ if (dart_tablebase != 0 && dart_tablebase >= base
+ && dart_tablebase < (base + size)) {
+ if (base != dart_tablebase)
+ create_pte_mapping(base, dart_tablebase, mode_rw,
+ use_largepages);
+ if ((base + size) > (dart_tablebase + 16*MB))
+ create_pte_mapping(dart_tablebase + 16*MB, base + size,
+ mode_rw, use_largepages);
+ continue;
+ }
+#endif /* CONFIG_U3_DART */
+ create_pte_mapping(base, base + size, mode_rw, use_largepages);
+ }
+
+ /*
+ * If we have a memory_limit and we've allocated TCEs then we need to
+ * explicitly map the TCE area at the top of RAM. We also cope with the
+ * case that the TCEs start below memory_limit.
+ * tce_alloc_start/end are 16MB aligned so the mapping should work
+ * for either 4K or 16MB pages.
+ */
+ if (tce_alloc_start) {
+ tce_alloc_start += KERNELBASE;
+ tce_alloc_end += KERNELBASE;
+
+ if (base + size >= tce_alloc_start)
+ tce_alloc_start = base + size + 1;
+
+ create_pte_mapping(tce_alloc_start, tce_alloc_end,
+ mode_rw, use_largepages);
+ }
+
+ DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+ struct page *page;
+
+ if (!pfn_valid(pte_pfn(pte)))
+ return pp;
+
+ page = pte_page(pte);
+
+ /* page is dirty */
+ if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+ if (trap == 0x400) {
+ __flush_dcache_icache(page_address(page));
+ set_bit(PG_arch_1, &page->flags);
+ } else
+ pp |= HW_NO_EXEC;
+ }
+ return pp;
+}
+
+/* Result code is:
+ * 0 - handled
+ * 1 - normal page fault
+ * -1 - critical hash insertion error
+ */
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
+{
+ void *pgdir;
+ unsigned long vsid;
+ struct mm_struct *mm;
+ pte_t *ptep;
+ int ret;
+ int user_region = 0;
+ int local = 0;
+ cpumask_t tmp;
+
+ switch (REGION_ID(ea)) {
+ case USER_REGION_ID:
+ user_region = 1;
+ mm = current->mm;
+ if ((ea > USER_END) || (! mm))
+ return 1;
+
+ vsid = get_vsid(mm->context.id, ea);
+ break;
+ case IO_REGION_ID:
+ if (ea > IMALLOC_END)
+ return 1;
+ mm = &ioremap_mm;
+ vsid = get_kernel_vsid(ea);
+ break;
+ case VMALLOC_REGION_ID:
+ if (ea > VMALLOC_END)
+ return 1;
+ mm = &init_mm;
+ vsid = get_kernel_vsid(ea);
+ break;
+#if 0
+ case KERNEL_REGION_ID:
+ /*
+ * Should never get here - entire 0xC0... region is bolted.
+ * Send the problem up to do_page_fault
+ */
+#endif
+ default:
+ /* Not a valid range
+ * Send the problem up to do_page_fault
+ */
+ return 1;
+ break;
+ }
+
+ pgdir = mm->pgd;
+
+ if (pgdir == NULL)
+ return 1;
+
+ tmp = cpumask_of_cpu(smp_processor_id());
+ if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+ local = 1;
+
+ /* Is this a huge page ? */
+ if (unlikely(in_hugepage_area(mm->context, ea)))
+ ret = hash_huge_page(mm, access, ea, vsid, local);
+ else {
+ ptep = find_linux_pte(pgdir, ea);
+ if (ptep == NULL)
+ return 1;
+ ret = __hash_page(ea, access, vsid, ptep, trap, local);
+ }
+
+ return ret;
+}
+
+void flush_hash_page(unsigned long context, unsigned long ea, pte_t pte,
+ int local)
+{
+ unsigned long vsid, vpn, va, hash, secondary, slot;
+ unsigned long huge = pte_huge(pte);
+
+ if ((ea >= USER_START) && (ea <= USER_END))
+ vsid = get_vsid(context, ea);
+ else
+ vsid = get_kernel_vsid(ea);
+
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ if (huge)
+ vpn = va >> HPAGE_SHIFT;
+ else
+ vpn = va >> PAGE_SHIFT;
+ hash = hpt_hash(vpn, huge);
+ secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
+ if (secondary)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
+
+ ppc_md.hpte_invalidate(slot, va, huge, local);
+}
+
+void flush_hash_range(unsigned long context, unsigned long number, int local)
+{
+ if (ppc_md.flush_hash_range) {
+ ppc_md.flush_hash_range(context, number, local);
+ } else {
+ int i;
+ struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+
+ for (i = 0; i < number; i++)
+ flush_hash_page(context, batch->addr[i], batch->pte[i],
+ local);
+ }
+}
+
+static inline void make_bl(unsigned int *insn_addr, void *func)
+{
+ unsigned long funcp = *((unsigned long *)func);
+ int offset = funcp - (unsigned long)insn_addr;
+
+ *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
+ flush_icache_range((unsigned long)insn_addr, 4+
+ (unsigned long)insn_addr);
+}
+
+/*
+ * low_hash_fault is called when we the low level hash code failed
+ * to instert a PTE due to an hypervisor error
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address)
+{
+ if (user_mode(regs)) {
+ siginfo_t info;
+
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+ force_sig_info(SIGBUS, &info, current);
+ return;
+ }
+ bad_page_fault(regs, address, SIGBUS);
+}
+
+void __init htab_finish_init(void)
+{
+ extern unsigned int *htab_call_hpte_insert1;
+ extern unsigned int *htab_call_hpte_insert2;
+ extern unsigned int *htab_call_hpte_remove;
+ extern unsigned int *htab_call_hpte_updatepp;
+
+ make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
+ make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
+ make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
+ make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+}
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
new file mode 100644
index 00000000000..c62ddaff072
--- /dev/null
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -0,0 +1,904 @@
+/*
+ * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/tlb.h>
+
+#include <linux/sysctl.h>
+
+#define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3)
+#define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT)
+#define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1))
+
+#define HUGEPTE_INDEX_SIZE 9
+#define HUGEPGD_INDEX_SIZE 10
+
+#define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE)
+#define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE)
+
+static inline int hugepgd_index(unsigned long addr)
+{
+ return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
+}
+
+static pgd_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
+{
+ int index;
+
+ if (! mm->context.huge_pgdir)
+ return NULL;
+
+
+ index = hugepgd_index(addr);
+ BUG_ON(index >= PTRS_PER_HUGEPGD);
+ return mm->context.huge_pgdir + index;
+}
+
+static inline pte_t *hugepte_offset(pgd_t *dir, unsigned long addr)
+{
+ int index;
+
+ if (pgd_none(*dir))
+ return NULL;
+
+ index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
+ return (pte_t *)pgd_page(*dir) + index;
+}
+
+static pgd_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
+{
+ BUG_ON(! in_hugepage_area(mm->context, addr));
+
+ if (! mm->context.huge_pgdir) {
+ pgd_t *new;
+ spin_unlock(&mm->page_table_lock);
+ /* Don't use pgd_alloc(), because we want __GFP_REPEAT */
+ new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
+ BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
+ spin_lock(&mm->page_table_lock);
+
+ /*
+ * Because we dropped the lock, we should re-check the
+ * entry, as somebody else could have populated it..
+ */
+ if (mm->context.huge_pgdir)
+ pgd_free(new);
+ else
+ mm->context.huge_pgdir = new;
+ }
+ return hugepgd_offset(mm, addr);
+}
+
+static pte_t *hugepte_alloc(struct mm_struct *mm, pgd_t *dir,
+ unsigned long addr)
+{
+ if (! pgd_present(*dir)) {
+ pte_t *new;
+
+ spin_unlock(&mm->page_table_lock);
+ new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
+ BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
+ spin_lock(&mm->page_table_lock);
+ /*
+ * Because we dropped the lock, we should re-check the
+ * entry, as somebody else could have populated it..
+ */
+ if (pgd_present(*dir)) {
+ if (new)
+ kmem_cache_free(zero_cache, new);
+ } else {
+ struct page *ptepage;
+
+ if (! new)
+ return NULL;
+ ptepage = virt_to_page(new);
+ ptepage->mapping = (void *) mm;
+ ptepage->index = addr & HUGEPGDIR_MASK;
+ pgd_populate(mm, dir, new);
+ }
+ }
+
+ return hugepte_offset(dir, addr);
+}
+
+static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+
+ BUG_ON(! in_hugepage_area(mm->context, addr));
+
+ pgd = hugepgd_offset(mm, addr);
+ if (! pgd)
+ return NULL;
+
+ return hugepte_offset(pgd, addr);
+}
+
+static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+
+ BUG_ON(! in_hugepage_area(mm->context, addr));
+
+ pgd = hugepgd_alloc(mm, addr);
+ if (! pgd)
+ return NULL;
+
+ return hugepte_alloc(mm, pgd, addr);
+}
+
+static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, struct page *page,
+ pte_t *ptep, int write_access)
+{
+ pte_t entry;
+
+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+ if (write_access) {
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ } else {
+ entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ }
+ entry = pte_mkyoung(entry);
+ entry = pte_mkhuge(entry);
+
+ set_pte_at(mm, addr, ptep, entry);
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+ if (addr & ~HPAGE_MASK)
+ return -EINVAL;
+ if (! (within_hugepage_low_range(addr, len)
+ || within_hugepage_high_range(addr, len)) )
+ return -EINVAL;
+ return 0;
+}
+
+static void flush_segments(void *parm)
+{
+ u16 segs = (unsigned long) parm;
+ unsigned long i;
+
+ asm volatile("isync" : : : "memory");
+
+ for (i = 0; i < 16; i++) {
+ if (! (segs & (1U << i)))
+ continue;
+ asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
+ }
+
+ asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
+{
+ unsigned long start = seg << SID_SHIFT;
+ unsigned long end = (seg+1) << SID_SHIFT;
+ struct vm_area_struct *vma;
+ unsigned long addr;
+ struct mmu_gather *tlb;
+
+ BUG_ON(seg >= 16);
+
+ /* Check no VMAs are in the region */
+ vma = find_vma(mm, start);
+ if (vma && (vma->vm_start < end))
+ return -EBUSY;
+
+ /* Clean up any leftover PTE pages in the region */
+ spin_lock(&mm->page_table_lock);
+ tlb = tlb_gather_mmu(mm, 0);
+ for (addr = start; addr < end; addr += PMD_SIZE) {
+ pgd_t *pgd = pgd_offset(mm, addr);
+ pmd_t *pmd;
+ struct page *page;
+ pte_t *pte;
+ int i;
+
+ if (pgd_none(*pgd))
+ continue;
+ pmd = pmd_offset(pgd, addr);
+ if (!pmd || pmd_none(*pmd))
+ continue;
+ if (pmd_bad(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ continue;
+ }
+ pte = (pte_t *)pmd_page_kernel(*pmd);
+ /* No VMAs, so there should be no PTEs, check just in case. */
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ BUG_ON(!pte_none(*pte));
+ pte++;
+ }
+ page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ mm->nr_ptes--;
+ dec_page_state(nr_page_table_pages);
+ pte_free_tlb(tlb, page);
+ }
+ tlb_finish_mmu(tlb, start, end);
+ spin_unlock(&mm->page_table_lock);
+
+ return 0;
+}
+
+static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+{
+ unsigned long i;
+
+ newsegs &= ~(mm->context.htlb_segs);
+ if (! newsegs)
+ return 0; /* The segments we want are already open */
+
+ for (i = 0; i < 16; i++)
+ if ((1 << i) & newsegs)
+ if (prepare_low_seg_for_htlb(mm, i) != 0)
+ return -EBUSY;
+
+ mm->context.htlb_segs |= newsegs;
+
+ /* update the paca copy of the context struct */
+ get_paca()->context = mm->context;
+
+ /* the context change must make it to memory before the flush,
+ * so that further SLB misses do the right thing. */
+ mb();
+ on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+
+ return 0;
+}
+
+int prepare_hugepage_range(unsigned long addr, unsigned long len)
+{
+ if (within_hugepage_high_range(addr, len))
+ return 0;
+ else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
+ int err;
+ /* Yes, we need both tests, in case addr+len overflows
+ * 64-bit arithmetic */
+ err = open_low_hpage_segs(current->mm,
+ LOW_ESID_MASK(addr, len));
+ if (err)
+ printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+ " failed (segs: 0x%04hx)\n", addr, len,
+ LOW_ESID_MASK(addr, len));
+ return err;
+ }
+
+ return -EINVAL;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ int err = -ENOMEM;
+
+ while (addr < end) {
+ dst_pte = huge_pte_alloc(dst, addr);
+ if (!dst_pte)
+ goto out;
+
+ src_pte = huge_pte_offset(src, addr);
+ entry = *src_pte;
+
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_pte_at(dst, addr, dst_pte, entry);
+
+ addr += HPAGE_SIZE;
+ }
+
+ err = 0;
+ out:
+ return err;
+}
+
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i)
+{
+ unsigned long vpfn, vaddr = *position;
+ int remainder = *length;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+
+ vpfn = vaddr/PAGE_SIZE;
+ while (vaddr < vma->vm_end && remainder) {
+ if (pages) {
+ pte_t *pte;
+ struct page *page;
+
+ pte = huge_pte_offset(mm, vaddr);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ pages[i] = page;
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++vpfn;
+ --remainder;
+ ++i;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+
+ return i;
+}
+
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+ pte_t *ptep;
+ struct page *page;
+
+ if (! in_hugepage_area(mm->context, address))
+ return ERR_PTR(-EINVAL);
+
+ ptep = huge_pte_offset(mm, address);
+ page = pte_page(*ptep);
+ if (page)
+ page += (address % HPAGE_SIZE) / PAGE_SIZE;
+
+ return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ BUG();
+ return NULL;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr;
+ pte_t *ptep;
+ struct page *page;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON((start % HPAGE_SIZE) != 0);
+ BUG_ON((end % HPAGE_SIZE) != 0);
+
+ for (addr = start; addr < end; addr += HPAGE_SIZE) {
+ pte_t pte;
+
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep || pte_none(*ptep))
+ continue;
+
+ pte = *ptep;
+ page = pte_page(pte);
+ pte_clear(mm, addr, ptep);
+
+ put_page(page);
+ }
+ add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+ flush_tlb_pending();
+}
+
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end)
+{
+ /* Because the huge pgtables are only 2 level, they can take
+ * at most around 4M, much less than one hugepage which the
+ * process is presumably entitled to use. So we don't bother
+ * freeing up the pagetables on unmap, and wait until
+ * destroy_context() to clean up the lot. */
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
+ BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ pte_t *pte = huge_pte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (! pte_none(*pte))
+ continue;
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ /* charge the fs quota first */
+ if (hugetlb_get_quota(mapping)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (! ret) {
+ unlock_page(page);
+ } else {
+ hugetlb_put_quota(mapping);
+ free_huge_page(page);
+ goto out;
+ }
+ }
+ set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+/* Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved regions. */
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long start_addr;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (((TASK_SIZE - len) >= addr)
+ && (!vma || (addr+len) <= vma->vm_start)
+ && !is_hugepage_only_range(mm, addr,len))
+ return addr;
+ }
+ start_addr = addr = mm->free_area_cache;
+
+full_search:
+ vma = find_vma(mm, addr);
+ while (TASK_SIZE - len >= addr) {
+ BUG_ON(vma && (addr >= vma->vm_end));
+
+ if (touches_hugepage_low_range(mm, addr, len)) {
+ addr = ALIGN(addr+1, 1<<SID_SHIFT);
+ vma = find_vma(mm, addr);
+ continue;
+ }
+ if (touches_hugepage_high_range(addr, len)) {
+ addr = TASK_HPAGE_END;
+ vma = find_vma(mm, addr);
+ continue;
+ }
+ if (!vma || addr + len <= vma->vm_start) {
+ /*
+ * Remember the place where we stopped the search:
+ */
+ mm->free_area_cache = addr + len;
+ return addr;
+ }
+ addr = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+ /* Make sure we didn't miss any holes */
+ if (start_addr != TASK_UNMAPPED_BASE) {
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ goto full_search;
+ }
+ return -ENOMEM;
+}
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base):
+ *
+ * Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved regions.
+ */
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_area_struct *vma, *prev_vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long base = mm->mmap_base, addr = addr0;
+ int first_time = 1;
+
+ /* requested length too big for entire address space */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ /* dont allow allocations above current base */
+ if (mm->free_area_cache > base)
+ mm->free_area_cache = base;
+
+ /* requesting a specific address */
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start)
+ && !is_hugepage_only_range(mm, addr,len))
+ return addr;
+ }
+
+try_again:
+ /* make sure it can fit in the remaining address space */
+ if (mm->free_area_cache < len)
+ goto fail;
+
+ /* either no address requested or cant fit in requested address hole */
+ addr = (mm->free_area_cache - len) & PAGE_MASK;
+ do {
+hugepage_recheck:
+ if (touches_hugepage_low_range(mm, addr, len)) {
+ addr = (addr & ((~0) << SID_SHIFT)) - len;
+ goto hugepage_recheck;
+ } else if (touches_hugepage_high_range(addr, len)) {
+ addr = TASK_HPAGE_BASE - len;
+ }
+
+ /*
+ * Lookup failure means no vma is above this address,
+ * i.e. return with success:
+ */
+ if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+ return addr;
+
+ /*
+ * new region fits between prev_vma->vm_end and
+ * vma->vm_start, use it:
+ */
+ if (addr+len <= vma->vm_start &&
+ (!prev_vma || (addr >= prev_vma->vm_end)))
+ /* remember the address as a hint for next time */
+ return (mm->free_area_cache = addr);
+ else
+ /* pull free_area_cache down to the first hole */
+ if (mm->free_area_cache == vma->vm_end)
+ mm->free_area_cache = vma->vm_start;
+
+ /* try just below the current vma->vm_start */
+ addr = vma->vm_start-len;
+ } while (len <= vma->vm_start);
+
+fail:
+ /*
+ * if hint left us with no space for the requested
+ * mapping then try again:
+ */
+ if (first_time) {
+ mm->free_area_cache = base;
+ first_time = 0;
+ goto try_again;
+ }
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+ /*
+ * Restore the topdown base:
+ */
+ mm->free_area_cache = base;
+
+ return addr;
+}
+
+static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
+{
+ unsigned long addr = 0;
+ struct vm_area_struct *vma;
+
+ vma = find_vma(current->mm, addr);
+ while (addr + len <= 0x100000000UL) {
+ BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+
+ if (! __within_hugepage_low_range(addr, len, segmask)) {
+ addr = ALIGN(addr+1, 1<<SID_SHIFT);
+ vma = find_vma(current->mm, addr);
+ continue;
+ }
+
+ if (!vma || (addr + len) <= vma->vm_start)
+ return addr;
+ addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+ /* Depending on segmask this might not be a confirmed
+ * hugepage region, so the ALIGN could have skipped
+ * some VMAs */
+ vma = find_vma(current->mm, addr);
+ }
+
+ return -ENOMEM;
+}
+
+static unsigned long htlb_get_high_area(unsigned long len)
+{
+ unsigned long addr = TASK_HPAGE_BASE;
+ struct vm_area_struct *vma;
+
+ vma = find_vma(current->mm, addr);
+ for (vma = find_vma(current->mm, addr);
+ addr + len <= TASK_HPAGE_END;
+ vma = vma->vm_next) {
+ BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+ BUG_ON(! within_hugepage_high_range(addr, len));
+
+ if (!vma || (addr + len) <= vma->vm_start)
+ return addr;
+ addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+ /* Because we're in a hugepage region, this alignment
+ * should not skip us over any VMAs */
+ }
+
+ return -ENOMEM;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+
+ if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+ return -EINVAL;
+
+ if (test_thread_flag(TIF_32BIT)) {
+ int lastshift = 0;
+ u16 segmask, cursegs = current->mm->context.htlb_segs;
+
+ /* First see if we can do the mapping in the existing
+ * low hpage segments */
+ addr = htlb_get_low_area(len, cursegs);
+ if (addr != -ENOMEM)
+ return addr;
+
+ for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
+ ! lastshift; segmask >>=1) {
+ if (segmask & 1)
+ lastshift = 1;
+
+ addr = htlb_get_low_area(len, cursegs | segmask);
+ if ((addr != -ENOMEM)
+ && open_low_hpage_segs(current->mm, segmask) == 0)
+ return addr;
+ }
+ printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+ " enough segments\n");
+ return -ENOMEM;
+ } else {
+ return htlb_get_high_area(len);
+ }
+}
+
+void hugetlb_mm_free_pgd(struct mm_struct *mm)
+{
+ int i;
+ pgd_t *pgdir;
+
+ spin_lock(&mm->page_table_lock);
+
+ pgdir = mm->context.huge_pgdir;
+ if (! pgdir)
+ goto out;
+
+ mm->context.huge_pgdir = NULL;
+
+ /* cleanup any hugepte pages leftover */
+ for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
+ pgd_t *pgd = pgdir + i;
+
+ if (! pgd_none(*pgd)) {
+ pte_t *pte = (pte_t *)pgd_page(*pgd);
+ struct page *ptepage = virt_to_page(pte);
+
+ ptepage->mapping = NULL;
+
+ BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
+ kmem_cache_free(zero_cache, pte);
+ }
+ pgd_clear(pgd);
+ }
+
+ BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
+ kmem_cache_free(zero_cache, pgdir);
+
+ out:
+ spin_unlock(&mm->page_table_lock);
+}
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+ unsigned long ea, unsigned long vsid, int local)
+{
+ pte_t *ptep;
+ unsigned long va, vpn;
+ pte_t old_pte, new_pte;
+ unsigned long hpteflags, prpn;
+ long slot;
+ int err = 1;
+
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, ea);
+
+ /* Search the Linux page table for a match with va */
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ vpn = va >> HPAGE_SHIFT;
+
+ /*
+ * If no pte found or not present, send the problem up to
+ * do_page_fault
+ */
+ if (unlikely(!ptep || pte_none(*ptep)))
+ goto out;
+
+/* BUG_ON(pte_bad(*ptep)); */
+
+ /*
+ * Check the user's access rights to the page. If access should be
+ * prevented then send the problem up to do_page_fault.
+ */
+ if (unlikely(access & ~pte_val(*ptep)))
+ goto out;
+ /*
+ * At this point, we have a pte (old_pte) which can be used to build
+ * or update an HPTE. There are 2 cases:
+ *
+ * 1. There is a valid (present) pte with no associated HPTE (this is
+ * the most common case)
+ * 2. There is a valid (present) pte with an associated HPTE. The
+ * current values of the pp bits in the HPTE prevent access
+ * because we are doing software DIRTY bit management and the
+ * page is currently not DIRTY.
+ */
+
+
+ old_pte = *ptep;
+ new_pte = old_pte;
+
+ hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+ /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+ hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+
+ /* Check if pte already has an hpte (case 2) */
+ if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+ /* There MIGHT be an HPTE for this pte */
+ unsigned long hash, slot;
+
+ hash = hpt_hash(vpn, 1);
+ if (pte_val(old_pte) & _PAGE_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+
+ if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
+ pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
+ unsigned long hash = hpt_hash(vpn, 1);
+ unsigned long hpte_group;
+
+ prpn = pte_pfn(old_pte);
+
+repeat:
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ /* Update the linux pte with the HPTE slot */
+ pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
+ pte_val(new_pte) |= _PAGE_HASHPTE;
+
+ /* Add in WIMG bits */
+ /* XXX We should store these in the pte */
+ hpteflags |= _PAGE_COHERENT;
+
+ slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
+ hpteflags, 0, 1);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ pte_val(new_pte) |= _PAGE_SECONDARY;
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+ 1, hpteflags, 0, 1);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ if (unlikely(slot == -2))
+ panic("hash_huge_page: pte_insert failed\n");
+
+ pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
+
+ /*
+ * No need to use ldarx/stdcx here because all who
+ * might be updating the pte will hold the
+ * page_table_lock
+ */
+ *ptep = new_pte;
+ }
+
+ err = 0;
+
+ out:
+ spin_unlock(&mm->page_table_lock);
+
+ return err;
+}
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
new file mode 100644
index 00000000000..9d92b0d9cde
--- /dev/null
+++ b/arch/ppc64/mm/imalloc.c
@@ -0,0 +1,312 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/semaphore.h>
+
+static DECLARE_MUTEX(imlist_sem);
+struct vm_struct * imlist = NULL;
+
+static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
+{
+ unsigned long addr;
+ struct vm_struct **p, *tmp;
+
+ addr = IMALLOC_START;
+ for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
+ if (size + addr < (unsigned long) tmp->addr)
+ break;
+ if ((unsigned long)tmp->addr >= IMALLOC_START)
+ addr = tmp->size + (unsigned long) tmp->addr;
+ if (addr > IMALLOC_END-size)
+ return 1;
+ }
+ *im_addr = addr;
+
+ return 0;
+}
+
+/* Return whether the region described by v_addr and size is a subset
+ * of the region described by parent
+ */
+static inline int im_region_is_subset(unsigned long v_addr, unsigned long size,
+ struct vm_struct *parent)
+{
+ return (int) (v_addr >= (unsigned long) parent->addr &&
+ v_addr < (unsigned long) parent->addr + parent->size &&
+ size < parent->size);
+}
+
+/* Return whether the region described by v_addr and size is a superset
+ * of the region described by child
+ */
+static int im_region_is_superset(unsigned long v_addr, unsigned long size,
+ struct vm_struct *child)
+{
+ struct vm_struct parent;
+
+ parent.addr = (void *) v_addr;
+ parent.size = size;
+
+ return im_region_is_subset((unsigned long) child->addr, child->size,
+ &parent);
+}
+
+/* Return whether the region described by v_addr and size overlaps
+ * the region described by vm. Overlapping regions meet the
+ * following conditions:
+ * 1) The regions share some part of the address space
+ * 2) The regions aren't identical
+ * 3) Neither region is a subset of the other
+ */
+static int im_region_overlaps(unsigned long v_addr, unsigned long size,
+ struct vm_struct *vm)
+{
+ if (im_region_is_superset(v_addr, size, vm))
+ return 0;
+
+ return (v_addr + size > (unsigned long) vm->addr + vm->size &&
+ v_addr < (unsigned long) vm->addr + vm->size) ||
+ (v_addr < (unsigned long) vm->addr &&
+ v_addr + size > (unsigned long) vm->addr);
+}
+
+/* Determine imalloc status of region described by v_addr and size.
+ * Can return one of the following:
+ * IM_REGION_UNUSED - Entire region is unallocated in imalloc space.
+ * IM_REGION_SUBSET - Region is a subset of a region that is already
+ * allocated in imalloc space.
+ * vm will be assigned to a ptr to the parent region.
+ * IM_REGION_EXISTS - Exact region already allocated in imalloc space.
+ * vm will be assigned to a ptr to the existing imlist
+ * member.
+ * IM_REGION_OVERLAPS - Region overlaps an allocated region in imalloc space.
+ * IM_REGION_SUPERSET - Region is a superset of a region that is already
+ * allocated in imalloc space.
+ */
+static int im_region_status(unsigned long v_addr, unsigned long size,
+ struct vm_struct **vm)
+{
+ struct vm_struct *tmp;
+
+ for (tmp = imlist; tmp; tmp = tmp->next)
+ if (v_addr < (unsigned long) tmp->addr + tmp->size)
+ break;
+
+ if (tmp) {
+ if (im_region_overlaps(v_addr, size, tmp))
+ return IM_REGION_OVERLAP;
+
+ *vm = tmp;
+ if (im_region_is_subset(v_addr, size, tmp)) {
+ /* Return with tmp pointing to superset */
+ return IM_REGION_SUBSET;
+ }
+ if (im_region_is_superset(v_addr, size, tmp)) {
+ /* Return with tmp pointing to first subset */
+ return IM_REGION_SUPERSET;
+ }
+ else if (v_addr == (unsigned long) tmp->addr &&
+ size == tmp->size) {
+ /* Return with tmp pointing to exact region */
+ return IM_REGION_EXISTS;
+ }
+ }
+
+ *vm = NULL;
+ return IM_REGION_UNUSED;
+}
+
+static struct vm_struct * split_im_region(unsigned long v_addr,
+ unsigned long size, struct vm_struct *parent)
+{
+ struct vm_struct *vm1 = NULL;
+ struct vm_struct *vm2 = NULL;
+ struct vm_struct *new_vm = NULL;
+
+ vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL);
+ if (vm1 == NULL) {
+ printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
+ return NULL;
+ }
+
+ if (v_addr == (unsigned long) parent->addr) {
+ /* Use existing parent vm_struct to represent child, allocate
+ * new one for the remainder of parent range
+ */
+ vm1->size = parent->size - size;
+ vm1->addr = (void *) (v_addr + size);
+ vm1->next = parent->next;
+
+ parent->size = size;
+ parent->next = vm1;
+ new_vm = parent;
+ } else if (v_addr + size == (unsigned long) parent->addr +
+ parent->size) {
+ /* Allocate new vm_struct to represent child, use existing
+ * parent one for remainder of parent range
+ */
+ vm1->size = size;
+ vm1->addr = (void *) v_addr;
+ vm1->next = parent->next;
+ new_vm = vm1;
+
+ parent->size -= size;
+ parent->next = vm1;
+ } else {
+ /* Allocate two new vm_structs for the new child and
+ * uppermost remainder, and use existing parent one for the
+ * lower remainder of parent range
+ */
+ vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL);
+ if (vm2 == NULL) {
+ printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
+ kfree(vm1);
+ return NULL;
+ }
+
+ vm1->size = size;
+ vm1->addr = (void *) v_addr;
+ vm1->next = vm2;
+ new_vm = vm1;
+
+ vm2->size = ((unsigned long) parent->addr + parent->size) -
+ (v_addr + size);
+ vm2->addr = (void *) v_addr + size;
+ vm2->next = parent->next;
+
+ parent->size = v_addr - (unsigned long) parent->addr;
+ parent->next = vm1;
+ }
+
+ return new_vm;
+}
+
+static struct vm_struct * __add_new_im_area(unsigned long req_addr,
+ unsigned long size)
+{
+ struct vm_struct **p, *tmp, *area;
+
+ for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
+ if (req_addr + size <= (unsigned long)tmp->addr)
+ break;
+ }
+
+ area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
+ if (!area)
+ return NULL;
+ area->flags = 0;
+ area->addr = (void *)req_addr;
+ area->size = size;
+ area->next = *p;
+ *p = area;
+
+ return area;
+}
+
+static struct vm_struct * __im_get_area(unsigned long req_addr,
+ unsigned long size,
+ int criteria)
+{
+ struct vm_struct *tmp;
+ int status;
+
+ status = im_region_status(req_addr, size, &tmp);
+ if ((criteria & status) == 0) {
+ return NULL;
+ }
+
+ switch (status) {
+ case IM_REGION_UNUSED:
+ tmp = __add_new_im_area(req_addr, size);
+ break;
+ case IM_REGION_SUBSET:
+ tmp = split_im_region(req_addr, size, tmp);
+ break;
+ case IM_REGION_EXISTS:
+ /* Return requested region */
+ break;
+ case IM_REGION_SUPERSET:
+ /* Return first existing subset of requested region */
+ break;
+ default:
+ printk(KERN_ERR "%s() unexpected imalloc region status\n",
+ __FUNCTION__);
+ tmp = NULL;
+ }
+
+ return tmp;
+}
+
+struct vm_struct * im_get_free_area(unsigned long size)
+{
+ struct vm_struct *area;
+ unsigned long addr;
+
+ down(&imlist_sem);
+ if (get_free_im_addr(size, &addr)) {
+ printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n",
+ __FUNCTION__, size);
+ area = NULL;
+ goto next_im_done;
+ }
+
+ area = __im_get_area(addr, size, IM_REGION_UNUSED);
+ if (area == NULL) {
+ printk(KERN_ERR
+ "%s() cannot obtain area for addr 0x%lx size 0x%lx\n",
+ __FUNCTION__, addr, size);
+ }
+next_im_done:
+ up(&imlist_sem);
+ return area;
+}
+
+struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
+ int criteria)
+{
+ struct vm_struct *area;
+
+ down(&imlist_sem);
+ area = __im_get_area(v_addr, size, criteria);
+ up(&imlist_sem);
+ return area;
+}
+
+unsigned long im_free(void * addr)
+{
+ struct vm_struct **p, *tmp;
+ unsigned long ret_size = 0;
+
+ if (!addr)
+ return ret_size;
+ if ((PAGE_SIZE-1) & (unsigned long) addr) {
+ printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr);
+ return ret_size;
+ }
+ down(&imlist_sem);
+ for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
+ if (tmp->addr == addr) {
+ ret_size = tmp->size;
+ *p = tmp->next;
+ kfree(tmp);
+ up(&imlist_sem);
+ return ret_size;
+ }
+ }
+ up(&imlist_sem);
+ printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
+ addr);
+ return ret_size;
+}
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
new file mode 100644
index 00000000000..23813d03e1c
--- /dev/null
+++ b/arch/ppc64/mm/init.c
@@ -0,0 +1,927 @@
+/*
+ * PowerPC version
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/idr.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/abs_addr.h>
+#include <asm/prom.h>
+#include <asm/lmb.h>
+#include <asm/rtas.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/eeh.h>
+#include <asm/processor.h>
+#include <asm/mmzone.h>
+#include <asm/cputable.h>
+#include <asm/ppcdebug.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/iommu.h>
+#include <asm/abs_addr.h>
+#include <asm/vdso.h>
+
+int mem_init_done;
+unsigned long ioremap_bot = IMALLOC_BASE;
+static unsigned long phbs_io_bot = PHBS_IO_BASE;
+
+extern pgd_t swapper_pg_dir[];
+extern struct task_struct *current_set[NR_CPUS];
+
+extern pgd_t ioremap_dir[];
+pgd_t * ioremap_pgd = (pgd_t *)&ioremap_dir;
+
+unsigned long klimit = (unsigned long)_end;
+
+unsigned long _SDR1=0;
+unsigned long _ASR=0;
+
+/* max amount of RAM to use */
+unsigned long __max_memory;
+
+/* info on what we think the IO hole is */
+unsigned long io_hole_start;
+unsigned long io_hole_size;
+
+void show_mem(void)
+{
+ unsigned long total = 0, reserved = 0;
+ unsigned long shared = 0, cached = 0;
+ struct page *page;
+ pg_data_t *pgdat;
+ unsigned long i;
+
+ printk("Mem-info:\n");
+ show_free_areas();
+ printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; i++) {
+ page = pgdat->node_mem_map + i;
+ total++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
+ }
+ printk("%ld pages of RAM\n", total);
+ printk("%ld reserved pages\n", reserved);
+ printk("%ld pages shared\n", shared);
+ printk("%ld pages swap cached\n", cached);
+}
+
+#ifdef CONFIG_PPC_ISERIES
+
+void __iomem *ioremap(unsigned long addr, unsigned long size)
+{
+ return (void __iomem *)addr;
+}
+
+extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ return (void __iomem *)addr;
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+ return;
+}
+
+#else
+
+/*
+ * map_io_page currently only called by __ioremap
+ * map_io_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+static void map_io_page(unsigned long ea, unsigned long pa, int flags)
+{
+ pgd_t *pgdp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ unsigned long vsid;
+
+ if (mem_init_done) {
+ spin_lock(&ioremap_mm.page_table_lock);
+ pgdp = pgd_offset_i(ea);
+ pmdp = pmd_alloc(&ioremap_mm, pgdp, ea);
+ ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea);
+
+ pa = abs_to_phys(pa);
+ set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
+ spin_unlock(&ioremap_mm.page_table_lock);
+ } else {
+ unsigned long va, vpn, hash, hpteg;
+
+ /*
+ * If the mm subsystem is not fully up, we cannot create a
+ * linux page table entry for this mapping. Simply bolt an
+ * entry in the hardware page table.
+ */
+ vsid = get_kernel_vsid(ea);
+ va = (vsid << 28) | (ea & 0xFFFFFFF);
+ vpn = va >> PAGE_SHIFT;
+
+ hash = hpt_hash(vpn, 0);
+
+ hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+ /* Panic if a pte grpup is full */
+ if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, 0,
+ _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX,
+ 1, 0) == -1) {
+ panic("map_io_page: could not insert mapping");
+ }
+ }
+}
+
+
+static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
+ unsigned long ea, unsigned long size,
+ unsigned long flags)
+{
+ unsigned long i;
+
+ if ((flags & _PAGE_PRESENT) == 0)
+ flags |= pgprot_val(PAGE_KERNEL);
+ if (flags & (_PAGE_NO_CACHE | _PAGE_WRITETHRU))
+ flags |= _PAGE_GUARDED;
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ map_io_page(ea+i, pa+i, flags);
+ }
+
+ return (void __iomem *) (ea + (addr & ~PAGE_MASK));
+}
+
+
+void __iomem *
+ioremap(unsigned long addr, unsigned long size)
+{
+ return __ioremap(addr, size, _PAGE_NO_CACHE);
+}
+
+void __iomem *
+__ioremap(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ unsigned long pa, ea;
+
+ /*
+ * Choose an address to map it to.
+ * Once the imalloc system is running, we use it.
+ * Before that, we map using addresses going
+ * up from ioremap_bot. imalloc will use
+ * the addresses from ioremap_bot through
+ * IMALLOC_END (0xE000001fffffffff)
+ *
+ */
+ pa = addr & PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - pa;
+
+ if (size == 0)
+ return NULL;
+
+ if (mem_init_done) {
+ struct vm_struct *area;
+ area = im_get_free_area(size);
+ if (area == NULL)
+ return NULL;
+ ea = (unsigned long)(area->addr);
+ } else {
+ ea = ioremap_bot;
+ ioremap_bot += size;
+ }
+
+ return __ioremap_com(addr, pa, ea, size, flags);
+}
+
+#define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
+
+int __ioremap_explicit(unsigned long pa, unsigned long ea,
+ unsigned long size, unsigned long flags)
+{
+ struct vm_struct *area;
+
+ /* For now, require page-aligned values for pa, ea, and size */
+ if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
+ !IS_PAGE_ALIGNED(size)) {
+ printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
+ return 1;
+ }
+
+ if (!mem_init_done) {
+ /* Two things to consider in this case:
+ * 1) No records will be kept (imalloc, etc) that the region
+ * has been remapped
+ * 2) It won't be easy to iounmap() the region later (because
+ * of 1)
+ */
+ ;
+ } else {
+ area = im_get_area(ea, size,
+ IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
+ if (area == NULL) {
+ /* Expected when PHB-dlpar is in play */
+ return 1;
+ }
+ if (ea != (unsigned long) area->addr) {
+ printk(KERN_ERR "unexpected addr return from im_get_area\n");
+ return 1;
+ }
+ }
+
+ if (__ioremap_com(pa, pa, ea, size, flags) != (void *) ea) {
+ printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static void unmap_im_area_pte(pmd_t *pmd, unsigned long address,
+ unsigned long size)
+{
+ unsigned long base, end;
+ pte_t *pte;
+
+ if (pmd_none(*pmd))
+ return;
+ if (pmd_bad(*pmd)) {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ return;
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ base = address & PMD_MASK;
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+
+ do {
+ pte_t page;
+ page = ptep_get_and_clear(&ioremap_mm, base + address, pte);
+ address += PAGE_SIZE;
+ pte++;
+ if (pte_none(page))
+ continue;
+ if (pte_present(page))
+ continue;
+ printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
+ } while (address < end);
+}
+
+static void unmap_im_area_pmd(pgd_t *dir, unsigned long address,
+ unsigned long size)
+{
+ unsigned long base, end;
+ pmd_t *pmd;
+
+ if (pgd_none(*dir))
+ return;
+ if (pgd_bad(*dir)) {
+ pgd_ERROR(*dir);
+ pgd_clear(dir);
+ return;
+ }
+
+ pmd = pmd_offset(dir, address);
+ base = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+
+ do {
+ unmap_im_area_pte(pmd, base + address, end - address);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+}
+
+/*
+ * Unmap an IO region and remove it from imalloc'd list.
+ * Access to IO memory should be serialized by driver.
+ * This code is modeled after vmalloc code - unmap_vm_area()
+ *
+ * XXX what about calls before mem_init_done (ie python_countermeasures())
+ */
+void iounmap(volatile void __iomem *token)
+{
+ unsigned long address, start, end, size;
+ struct mm_struct *mm;
+ pgd_t *dir;
+ void *addr;
+
+ if (!mem_init_done) {
+ return;
+ }
+
+ addr = (void *) ((unsigned long __force) token & PAGE_MASK);
+
+ if ((size = im_free(addr)) == 0) {
+ return;
+ }
+
+ address = (unsigned long)addr;
+ start = address;
+ end = address + size;
+
+ mm = &ioremap_mm;
+ spin_lock(&mm->page_table_lock);
+
+ dir = pgd_offset_i(address);
+ flush_cache_vunmap(address, end);
+ do {
+ unmap_im_area_pmd(dir, address, end - address);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+ } while (address && (address < end));
+ flush_tlb_kernel_range(start, end);
+
+ spin_unlock(&mm->page_table_lock);
+ return;
+}
+
+static int iounmap_subset_regions(unsigned long addr, unsigned long size)
+{
+ struct vm_struct *area;
+
+ /* Check whether subsets of this region exist */
+ area = im_get_area(addr, size, IM_REGION_SUPERSET);
+ if (area == NULL)
+ return 1;
+
+ while (area) {
+ iounmap((void __iomem *) area->addr);
+ area = im_get_area(addr, size,
+ IM_REGION_SUPERSET);
+ }
+
+ return 0;
+}
+
+int iounmap_explicit(volatile void __iomem *start, unsigned long size)
+{
+ struct vm_struct *area;
+ unsigned long addr;
+ int rc;
+
+ addr = (unsigned long __force) start & PAGE_MASK;
+
+ /* Verify that the region either exists or is a subset of an existing
+ * region. In the latter case, split the parent region to create
+ * the exact region
+ */
+ area = im_get_area(addr, size,
+ IM_REGION_EXISTS | IM_REGION_SUBSET);
+ if (area == NULL) {
+ /* Determine whether subset regions exist. If so, unmap */
+ rc = iounmap_subset_regions(addr, size);
+ if (rc) {
+ printk(KERN_ERR
+ "%s() cannot unmap nonexistent range 0x%lx\n",
+ __FUNCTION__, addr);
+ return 1;
+ }
+ } else {
+ iounmap((void __iomem *) area->addr);
+ }
+ /*
+ * FIXME! This can't be right:
+ iounmap(area->addr);
+ * Maybe it should be "iounmap(area);"
+ */
+ return 0;
+}
+
+#endif
+
+EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(__ioremap);
+EXPORT_SYMBOL(iounmap);
+
+void free_initmem(void)
+{
+ unsigned long addr;
+
+ addr = (unsigned long)__init_begin;
+ for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ set_page_count(virt_to_page(addr), 1);
+ free_page(addr);
+ totalram_pages++;
+ }
+ printk ("Freeing unused kernel memory: %luk freed\n",
+ ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ if (start < end)
+ printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+ for (; start < end; start += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(start));
+ set_page_count(virt_to_page(start), 1);
+ free_page(start);
+ totalram_pages++;
+ }
+}
+#endif
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDR(mmu_context_idr);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ int index;
+ int err;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ /* We leave htlb_segs as it was, but for a fork, we need to
+ * clear the huge_pgdir. */
+ mm->context.huge_pgdir = NULL;
+#endif
+
+again:
+ if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&mmu_context_lock);
+ err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
+ spin_unlock(&mmu_context_lock);
+
+ if (err == -EAGAIN)
+ goto again;
+ else if (err)
+ return err;
+
+ if (index > MAX_CONTEXT) {
+ idr_remove(&mmu_context_idr, index);
+ return -ENOMEM;
+ }
+
+ mm->context.id = index;
+
+ return 0;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+ spin_lock(&mmu_context_lock);
+ idr_remove(&mmu_context_idr, mm->context.id);
+ spin_unlock(&mmu_context_lock);
+
+ mm->context.id = NO_CONTEXT;
+
+ hugetlb_mm_free_pgd(mm);
+}
+
+/*
+ * Do very early mm setup.
+ */
+void __init mm_init_ppc64(void)
+{
+#ifndef CONFIG_PPC_ISERIES
+ unsigned long i;
+#endif
+
+ ppc64_boot_msg(0x100, "MM Init");
+
+ /* This is the story of the IO hole... please, keep seated,
+ * unfortunately, we are out of oxygen masks at the moment.
+ * So we need some rough way to tell where your big IO hole
+ * is. On pmac, it's between 2G and 4G, on POWER3, it's around
+ * that area as well, on POWER4 we don't have one, etc...
+ * We need that as a "hint" when sizing the TCE table on POWER3
+ * So far, the simplest way that seem work well enough for us it
+ * to just assume that the first discontinuity in our physical
+ * RAM layout is the IO hole. That may not be correct in the future
+ * (and isn't on iSeries but then we don't care ;)
+ */
+
+#ifndef CONFIG_PPC_ISERIES
+ for (i = 1; i < lmb.memory.cnt; i++) {
+ unsigned long base, prevbase, prevsize;
+
+ prevbase = lmb.memory.region[i-1].physbase;
+ prevsize = lmb.memory.region[i-1].size;
+ base = lmb.memory.region[i].physbase;
+ if (base > (prevbase + prevsize)) {
+ io_hole_start = prevbase + prevsize;
+ io_hole_size = base - (prevbase + prevsize);
+ break;
+ }
+ }
+#endif /* CONFIG_PPC_ISERIES */
+ if (io_hole_start)
+ printk("IO Hole assumed to be %lx -> %lx\n",
+ io_hole_start, io_hole_start + io_hole_size - 1);
+
+ ppc64_boot_msg(0x100, "MM Init Done");
+}
+
+/*
+ * This is called by /dev/mem to know if a given address has to
+ * be mapped non-cacheable or not
+ */
+int page_is_ram(unsigned long pfn)
+{
+ int i;
+ unsigned long paddr = (pfn << PAGE_SHIFT);
+
+ for (i=0; i < lmb.memory.cnt; i++) {
+ unsigned long base;
+
+#ifdef CONFIG_MSCHUNKS
+ base = lmb.memory.region[i].physbase;
+#else
+ base = lmb.memory.region[i].base;
+#endif
+ if ((paddr >= base) &&
+ (paddr < (base + lmb.memory.region[i].size))) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(page_is_ram);
+
+/*
+ * Initialize the bootmem system and give it all the memory we
+ * have available.
+ */
+#ifndef CONFIG_DISCONTIGMEM
+void __init do_init_bootmem(void)
+{
+ unsigned long i;
+ unsigned long start, bootmap_pages;
+ unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
+ int boot_mapsize;
+
+ /*
+ * Find an area to use for the bootmem bitmap. Calculate the size of
+ * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
+ * Add 1 additional page in case the address isn't page-aligned.
+ */
+ bootmap_pages = bootmem_bootmap_pages(total_pages);
+
+ start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE));
+ BUG_ON(!start);
+
+ boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
+
+ max_pfn = max_low_pfn;
+
+ /* add all physical memory to the bootmem map. Also find the first */
+ for (i=0; i < lmb.memory.cnt; i++) {
+ unsigned long physbase, size;
+
+ physbase = lmb.memory.region[i].physbase;
+ size = lmb.memory.region[i].size;
+ free_bootmem(physbase, size);
+ }
+
+ /* reserve the sections we're already using */
+ for (i=0; i < lmb.reserved.cnt; i++) {
+ unsigned long physbase = lmb.reserved.region[i].physbase;
+ unsigned long size = lmb.reserved.region[i].size;
+
+ reserve_bootmem(physbase, size);
+ }
+}
+
+/*
+ * paging_init() sets up the page tables - in fact we've already done this.
+ */
+void __init paging_init(void)
+{
+ unsigned long zones_size[MAX_NR_ZONES];
+ unsigned long zholes_size[MAX_NR_ZONES];
+ unsigned long total_ram = lmb_phys_mem_size();
+ unsigned long top_of_ram = lmb_end_of_DRAM();
+
+ printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+ top_of_ram, total_ram);
+ printk(KERN_INFO "Memory hole size: %ldMB\n",
+ (top_of_ram - total_ram) >> 20);
+ /*
+ * All pages are DMA-able so we put them all in the DMA zone.
+ */
+ memset(zones_size, 0, sizeof(zones_size));
+ memset(zholes_size, 0, sizeof(zholes_size));
+
+ zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
+ zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
+
+ free_area_init_node(0, &contig_page_data, zones_size,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
+static struct kcore_list kcore_vmem;
+
+static int __init setup_kcore(void)
+{
+ int i;
+
+ for (i=0; i < lmb.memory.cnt; i++) {
+ unsigned long physbase, size;
+ struct kcore_list *kcore_mem;
+
+ physbase = lmb.memory.region[i].physbase;
+ size = lmb.memory.region[i].size;
+
+ /* GFP_ATOMIC to avoid might_sleep warnings during boot */
+ kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
+ if (!kcore_mem)
+ panic("mem_init: kmalloc failed\n");
+
+ kclist_add(kcore_mem, __va(physbase), size);
+ }
+
+ kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+
+ return 0;
+}
+module_init(setup_kcore);
+
+void __init mem_init(void)
+{
+#ifdef CONFIG_DISCONTIGMEM
+ int nid;
+#endif
+ pg_data_t *pgdat;
+ unsigned long i;
+ struct page *page;
+ unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+
+ num_physpages = max_low_pfn; /* RAM is assumed contiguous */
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+
+#ifdef CONFIG_DISCONTIGMEM
+ for_each_online_node(nid) {
+ if (NODE_DATA(nid)->node_spanned_pages != 0) {
+ printk("freeing bootmem node %x\n", nid);
+ totalram_pages +=
+ free_all_bootmem_node(NODE_DATA(nid));
+ }
+ }
+#else
+ max_mapnr = num_physpages;
+ totalram_pages += free_all_bootmem();
+#endif
+
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; i++) {
+ page = pgdat->node_mem_map + i;
+ if (PageReserved(page))
+ reservedpages++;
+ }
+ }
+
+ codesize = (unsigned long)&_etext - (unsigned long)&_stext;
+ initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
+ datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
+ bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+
+ printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
+ "%luk reserved, %luk data, %luk bss, %luk init)\n",
+ (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
+ num_physpages << (PAGE_SHIFT-10),
+ codesize >> 10,
+ reservedpages << (PAGE_SHIFT-10),
+ datasize >> 10,
+ bsssize >> 10,
+ initsize >> 10);
+
+ mem_init_done = 1;
+
+#ifdef CONFIG_PPC_ISERIES
+ iommu_vio_init();
+#endif
+ /* Initialize the vDSO */
+ vdso_init();
+}
+
+/*
+ * This is called when a page has been modified by the kernel.
+ * It just marks the page as not i-cache clean. We do the i-cache
+ * flush later when the page is given to a user process, if necessary.
+ */
+void flush_dcache_page(struct page *page)
+{
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &page->flags))
+ clear_bit(PG_arch_1, &page->flags);
+}
+EXPORT_SYMBOL(flush_dcache_page);
+
+void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
+{
+ clear_page(page);
+
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+ /*
+ * We shouldnt have to do this, but some versions of glibc
+ * require it (ld.so assumes zero filled pages are icache clean)
+ * - Anton
+ */
+
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &pg->flags))
+ clear_bit(PG_arch_1, &pg->flags);
+}
+EXPORT_SYMBOL(clear_user_page);
+
+void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
+ struct page *pg)
+{
+ copy_page(vto, vfrom);
+
+ /*
+ * We should be able to use the following optimisation, however
+ * there are two problems.
+ * Firstly a bug in some versions of binutils meant PLT sections
+ * were not marked executable.
+ * Secondly the first word in the GOT section is blrl, used
+ * to establish the GOT address. Until recently the GOT was
+ * not marked executable.
+ * - Anton
+ */
+#if 0
+ if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
+ return;
+#endif
+
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ return;
+
+ /* avoid an atomic op if possible */
+ if (test_bit(PG_arch_1, &pg->flags))
+ clear_bit(PG_arch_1, &pg->flags);
+}
+
+void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
+ unsigned long addr, int len)
+{
+ unsigned long maddr;
+
+ maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
+ flush_icache_range(maddr, maddr + len);
+}
+EXPORT_SYMBOL(flush_icache_user_range);
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the mm->page_table_lock held
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
+ pte_t pte)
+{
+ unsigned long vsid;
+ void *pgdir;
+ pte_t *ptep;
+ int local = 0;
+ cpumask_t tmp;
+ unsigned long flags;
+
+ /* handle i-cache coherency */
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
+ !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
+ unsigned long pfn = pte_pfn(pte);
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ if (!PageReserved(page)
+ && !test_bit(PG_arch_1, &page->flags)) {
+ __flush_dcache_icache(page_address(page));
+ set_bit(PG_arch_1, &page->flags);
+ }
+ }
+ }
+
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(pte))
+ return;
+
+ pgdir = vma->vm_mm->pgd;
+ if (pgdir == NULL)
+ return;
+
+ ptep = find_linux_pte(pgdir, ea);
+ if (!ptep)
+ return;
+
+ vsid = get_vsid(vma->vm_mm->context.id, ea);
+
+ local_irq_save(flags);
+ tmp = cpumask_of_cpu(smp_processor_id());
+ if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
+ local = 1;
+
+ __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
+ 0x300, local);
+ local_irq_restore(flags);
+}
+
+void __iomem * reserve_phb_iospace(unsigned long size)
+{
+ void __iomem *virt_addr;
+
+ if (phbs_io_bot >= IMALLOC_BASE)
+ panic("reserve_phb_iospace(): phb io space overflow\n");
+
+ virt_addr = (void __iomem *) phbs_io_bot;
+ phbs_io_bot += size;
+
+ return virt_addr;
+}
+
+kmem_cache_t *zero_cache;
+
+static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+{
+ memset(pte, 0, PAGE_SIZE);
+}
+
+void pgtable_cache_init(void)
+{
+ zero_cache = kmem_cache_create("zero",
+ PAGE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+ zero_ctor,
+ NULL);
+ if (!zero_cache)
+ panic("pgtable_cache_init(): could not create zero_cache!\n");
+}
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
+ unsigned long size, pgprot_t vma_prot)
+{
+ if (ppc_md.phys_mem_access_prot)
+ return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
+
+ if (!page_is_ram(addr >> PAGE_SHIFT))
+ vma_prot = __pgprot(pgprot_val(vma_prot)
+ | _PAGE_GUARDED | _PAGE_NO_CACHE);
+ return vma_prot;
+}
+EXPORT_SYMBOL(phys_mem_access_prot);
diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c
new file mode 100644
index 00000000000..fe65f522aff
--- /dev/null
+++ b/arch/ppc64/mm/mmap.c
@@ -0,0 +1,86 @@
+/*
+ * linux/arch/ppc64/mm/mmap.c
+ *
+ * flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave an at least ~128 MB hole.
+ */
+#define MIN_GAP (128*1024*1024)
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static inline unsigned long mmap_base(void)
+{
+ unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return TASK_SIZE - (gap & PAGE_MASK);
+}
+
+static inline int mmap_is_legacy(void)
+{
+ /*
+ * Force standard allocation for 64 bit programs.
+ */
+ if (!test_thread_flag(TIF_32BIT))
+ return 1;
+
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+ /*
+ * Fall back to the standard layout if the personality
+ * bit is set, or if the expected stack growth is unlimited:
+ */
+ if (mmap_is_legacy()) {
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
+ } else {
+ mm->mmap_base = mmap_base();
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ mm->unmap_area = arch_unmap_area_topdown;
+ }
+}
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
new file mode 100644
index 00000000000..ea862ec643d
--- /dev/null
+++ b/arch/ppc64/mm/numa.c
@@ -0,0 +1,734 @@
+/*
+ * pSeries NUMA support
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/threads.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <asm/lmb.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+
+static int numa_enabled = 1;
+
+static int numa_debug;
+#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
+
+#ifdef DEBUG_NUMA
+#define ARRAY_INITIALISER -1
+#else
+#define ARRAY_INITIALISER 0
+#endif
+
+int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
+ ARRAY_INITIALISER};
+char *numa_memory_lookup_table;
+cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
+int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
+
+struct pglist_data *node_data[MAX_NUMNODES];
+bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
+static int min_common_depth;
+
+/*
+ * We need somewhere to store start/span for each node until we have
+ * allocated the real node_data structures.
+ */
+static struct {
+ unsigned long node_start_pfn;
+ unsigned long node_end_pfn;
+ unsigned long node_present_pages;
+} init_node_data[MAX_NUMNODES] __initdata;
+
+EXPORT_SYMBOL(node_data);
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_memory_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(nr_cpus_in_node);
+
+static inline void map_cpu_to_node(int cpu, int node)
+{
+ numa_cpu_lookup_table[cpu] = node;
+ if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
+ cpu_set(cpu, numa_cpumask_lookup_table[node]);
+ nr_cpus_in_node[node]++;
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void unmap_cpu_from_node(unsigned long cpu)
+{
+ int node = numa_cpu_lookup_table[cpu];
+
+ dbg("removing cpu %lu from node %d\n", cpu, node);
+
+ if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+ cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+ nr_cpus_in_node[node]--;
+ } else {
+ printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
+ cpu, node);
+ }
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct device_node * __devinit find_cpu_node(unsigned int cpu)
+{
+ unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
+ struct device_node *cpu_node = NULL;
+ unsigned int *interrupt_server, *reg;
+ int len;
+
+ while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
+ /* Try interrupt server first */
+ interrupt_server = (unsigned int *)get_property(cpu_node,
+ "ibm,ppc-interrupt-server#s", &len);
+
+ len = len / sizeof(u32);
+
+ if (interrupt_server && (len > 0)) {
+ while (len--) {
+ if (interrupt_server[len] == hw_cpuid)
+ return cpu_node;
+ }
+ } else {
+ reg = (unsigned int *)get_property(cpu_node,
+ "reg", &len);
+ if (reg && (len > 0) && (reg[0] == hw_cpuid))
+ return cpu_node;
+ }
+ }
+
+ return NULL;
+}
+
+/* must hold reference to node during call */
+static int *of_get_associativity(struct device_node *dev)
+{
+ return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
+}
+
+static int of_node_numa_domain(struct device_node *device)
+{
+ int numa_domain;
+ unsigned int *tmp;
+
+ if (min_common_depth == -1)
+ return 0;
+
+ tmp = of_get_associativity(device);
+ if (tmp && (tmp[0] >= min_common_depth)) {
+ numa_domain = tmp[min_common_depth];
+ } else {
+ dbg("WARNING: no NUMA information for %s\n",
+ device->full_name);
+ numa_domain = 0;
+ }
+ return numa_domain;
+}
+
+/*
+ * In theory, the "ibm,associativity" property may contain multiple
+ * associativity lists because a resource may be multiply connected
+ * into the machine. This resource then has different associativity
+ * characteristics relative to its multiple connections. We ignore
+ * this for now. We also assume that all cpu and memory sets have
+ * their distances represented at a common level. This won't be
+ * true for heirarchical NUMA.
+ *
+ * In any case the ibm,associativity-reference-points should give
+ * the correct depth for a normal NUMA system.
+ *
+ * - Dave Hansen <haveblue@us.ibm.com>
+ */
+static int __init find_min_common_depth(void)
+{
+ int depth;
+ unsigned int *ref_points;
+ struct device_node *rtas_root;
+ unsigned int len;
+
+ rtas_root = of_find_node_by_path("/rtas");
+
+ if (!rtas_root)
+ return -1;
+
+ /*
+ * this property is 2 32-bit integers, each representing a level of
+ * depth in the associativity nodes. The first is for an SMP
+ * configuration (should be all 0's) and the second is for a normal
+ * NUMA configuration.
+ */
+ ref_points = (unsigned int *)get_property(rtas_root,
+ "ibm,associativity-reference-points", &len);
+
+ if ((len >= 1) && ref_points) {
+ depth = ref_points[1];
+ } else {
+ dbg("WARNING: could not find NUMA "
+ "associativity reference point\n");
+ depth = -1;
+ }
+ of_node_put(rtas_root);
+
+ return depth;
+}
+
+static int __init get_mem_addr_cells(void)
+{
+ struct device_node *memory = NULL;
+ int rc;
+
+ memory = of_find_node_by_type(memory, "memory");
+ if (!memory)
+ return 0; /* it won't matter */
+
+ rc = prom_n_addr_cells(memory);
+ return rc;
+}
+
+static int __init get_mem_size_cells(void)
+{
+ struct device_node *memory = NULL;
+ int rc;
+
+ memory = of_find_node_by_type(memory, "memory");
+ if (!memory)
+ return 0; /* it won't matter */
+ rc = prom_n_size_cells(memory);
+ return rc;
+}
+
+static unsigned long read_n_cells(int n, unsigned int **buf)
+{
+ unsigned long result = 0;
+
+ while (n--) {
+ result = (result << 32) | **buf;
+ (*buf)++;
+ }
+ return result;
+}
+
+/*
+ * Figure out to which domain a cpu belongs and stick it there.
+ * Return the id of the domain used.
+ */
+static int numa_setup_cpu(unsigned long lcpu)
+{
+ int numa_domain = 0;
+ struct device_node *cpu = find_cpu_node(lcpu);
+
+ if (!cpu) {
+ WARN_ON(1);
+ goto out;
+ }
+
+ numa_domain = of_node_numa_domain(cpu);
+
+ if (numa_domain >= num_online_nodes()) {
+ /*
+ * POWER4 LPAR uses 0xffff as invalid node,
+ * dont warn in this case.
+ */
+ if (numa_domain != 0xffff)
+ printk(KERN_ERR "WARNING: cpu %ld "
+ "maps to invalid NUMA node %d\n",
+ lcpu, numa_domain);
+ numa_domain = 0;
+ }
+out:
+ node_set_online(numa_domain);
+
+ map_cpu_to_node(lcpu, numa_domain);
+
+ of_node_put(cpu);
+
+ return numa_domain;
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned long lcpu = (unsigned long)hcpu;
+ int ret = NOTIFY_DONE;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (min_common_depth == -1 || !numa_enabled)
+ map_cpu_to_node(lcpu, 0);
+ else
+ numa_setup_cpu(lcpu);
+ ret = NOTIFY_OK;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ unmap_cpu_from_node(lcpu);
+ break;
+ ret = NOTIFY_OK;
+#endif
+ }
+ return ret;
+}
+
+/*
+ * Check and possibly modify a memory region to enforce the memory limit.
+ *
+ * Returns the size the region should have to enforce the memory limit.
+ * This will either be the original value of size, a truncated value,
+ * or zero. If the returned value of size is 0 the region should be
+ * discarded as it lies wholy above the memory limit.
+ */
+static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
+{
+ /*
+ * We use lmb_end_of_DRAM() in here instead of memory_limit because
+ * we've already adjusted it for the limit and it takes care of
+ * having memory holes below the limit.
+ */
+ extern unsigned long memory_limit;
+
+ if (! memory_limit)
+ return size;
+
+ if (start + size <= lmb_end_of_DRAM())
+ return size;
+
+ if (start >= lmb_end_of_DRAM())
+ return 0;
+
+ return lmb_end_of_DRAM() - start;
+}
+
+static int __init parse_numa_properties(void)
+{
+ struct device_node *cpu = NULL;
+ struct device_node *memory = NULL;
+ int addr_cells, size_cells;
+ int max_domain = 0;
+ long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
+ unsigned long i;
+
+ if (numa_enabled == 0) {
+ printk(KERN_WARNING "NUMA disabled by user\n");
+ return -1;
+ }
+
+ numa_memory_lookup_table =
+ (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+ memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+
+ for (i = 0; i < entries ; i++)
+ numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+
+ min_common_depth = find_min_common_depth();
+
+ dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+ if (min_common_depth < 0)
+ return min_common_depth;
+
+ max_domain = numa_setup_cpu(boot_cpuid);
+
+ /*
+ * Even though we connect cpus to numa domains later in SMP init,
+ * we need to know the maximum node id now. This is because each
+ * node id must have NODE_DATA etc backing it.
+ * As a result of hotplug we could still have cpus appear later on
+ * with larger node ids. In that case we force the cpu into node 0.
+ */
+ for_each_cpu(i) {
+ int numa_domain;
+
+ cpu = find_cpu_node(i);
+
+ if (cpu) {
+ numa_domain = of_node_numa_domain(cpu);
+ of_node_put(cpu);
+
+ if (numa_domain < MAX_NUMNODES &&
+ max_domain < numa_domain)
+ max_domain = numa_domain;
+ }
+ }
+
+ addr_cells = get_mem_addr_cells();
+ size_cells = get_mem_size_cells();
+ memory = NULL;
+ while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+ unsigned long start;
+ unsigned long size;
+ int numa_domain;
+ int ranges;
+ unsigned int *memcell_buf;
+ unsigned int len;
+
+ memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+ if (!memcell_buf || len <= 0)
+ continue;
+
+ ranges = memory->n_addrs;
+new_range:
+ /* these are order-sensitive, and modify the buffer pointer */
+ start = read_n_cells(addr_cells, &memcell_buf);
+ size = read_n_cells(size_cells, &memcell_buf);
+
+ start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
+ size = _ALIGN_UP(size, MEMORY_INCREMENT);
+
+ numa_domain = of_node_numa_domain(memory);
+
+ if (numa_domain >= MAX_NUMNODES) {
+ if (numa_domain != 0xffff)
+ printk(KERN_ERR "WARNING: memory at %lx maps "
+ "to invalid NUMA node %d\n", start,
+ numa_domain);
+ numa_domain = 0;
+ }
+
+ if (max_domain < numa_domain)
+ max_domain = numa_domain;
+
+ if (! (size = numa_enforce_memory_limit(start, size))) {
+ if (--ranges)
+ goto new_range;
+ else
+ continue;
+ }
+
+ /*
+ * Initialize new node struct, or add to an existing one.
+ */
+ if (init_node_data[numa_domain].node_end_pfn) {
+ if ((start / PAGE_SIZE) <
+ init_node_data[numa_domain].node_start_pfn)
+ init_node_data[numa_domain].node_start_pfn =
+ start / PAGE_SIZE;
+ if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
+ init_node_data[numa_domain].node_end_pfn)
+ init_node_data[numa_domain].node_end_pfn =
+ (start / PAGE_SIZE) +
+ (size / PAGE_SIZE);
+
+ init_node_data[numa_domain].node_present_pages +=
+ size / PAGE_SIZE;
+ } else {
+ node_set_online(numa_domain);
+
+ init_node_data[numa_domain].node_start_pfn =
+ start / PAGE_SIZE;
+ init_node_data[numa_domain].node_end_pfn =
+ init_node_data[numa_domain].node_start_pfn +
+ size / PAGE_SIZE;
+ init_node_data[numa_domain].node_present_pages =
+ size / PAGE_SIZE;
+ }
+
+ for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
+ numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
+ numa_domain;
+
+ if (--ranges)
+ goto new_range;
+ }
+
+ for (i = 0; i <= max_domain; i++)
+ node_set_online(i);
+
+ return 0;
+}
+
+static void __init setup_nonnuma(void)
+{
+ unsigned long top_of_ram = lmb_end_of_DRAM();
+ unsigned long total_ram = lmb_phys_mem_size();
+ unsigned long i;
+
+ printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+ top_of_ram, total_ram);
+ printk(KERN_INFO "Memory hole size: %ldMB\n",
+ (top_of_ram - total_ram) >> 20);
+
+ if (!numa_memory_lookup_table) {
+ long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
+ numa_memory_lookup_table =
+ (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+ memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+ for (i = 0; i < entries ; i++)
+ numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+ }
+
+ map_cpu_to_node(boot_cpuid, 0);
+
+ node_set_online(0);
+
+ init_node_data[0].node_start_pfn = 0;
+ init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
+ init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
+
+ for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
+ numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
+}
+
+static void __init dump_numa_topology(void)
+{
+ unsigned int node;
+ unsigned int count;
+
+ if (min_common_depth == -1 || !numa_enabled)
+ return;
+
+ for_each_online_node(node) {
+ unsigned long i;
+
+ printk(KERN_INFO "Node %d Memory:", node);
+
+ count = 0;
+
+ for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
+ if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
+ if (count == 0)
+ printk(" 0x%lx", i);
+ ++count;
+ } else {
+ if (count > 0)
+ printk("-0x%lx", i);
+ count = 0;
+ }
+ }
+
+ if (count > 0)
+ printk("-0x%lx", i);
+ printk("\n");
+ }
+ return;
+}
+
+/*
+ * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * required. nid is the preferred node and end is the physical address of
+ * the highest address in the node.
+ *
+ * Returns the physical address of the memory.
+ */
+static unsigned long careful_allocation(int nid, unsigned long size,
+ unsigned long align, unsigned long end)
+{
+ unsigned long ret = lmb_alloc_base(size, align, end);
+
+ /* retry over all memory */
+ if (!ret)
+ ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
+
+ if (!ret)
+ panic("numa.c: cannot allocate %lu bytes on node %d",
+ size, nid);
+
+ /*
+ * If the memory came from a previously allocated node, we must
+ * retry with the bootmem allocator.
+ */
+ if (pa_to_nid(ret) < nid) {
+ nid = pa_to_nid(ret);
+ ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
+ size, align, 0);
+
+ if (!ret)
+ panic("numa.c: cannot allocate %lu bytes on node %d",
+ size, nid);
+
+ ret = virt_to_abs(ret);
+
+ dbg("alloc_bootmem %lx %lx\n", ret, size);
+ }
+
+ return ret;
+}
+
+void __init do_init_bootmem(void)
+{
+ int nid;
+ int addr_cells, size_cells;
+ struct device_node *memory = NULL;
+ static struct notifier_block ppc64_numa_nb = {
+ .notifier_call = cpu_numa_callback,
+ .priority = 1 /* Must run before sched domains notifier. */
+ };
+
+ min_low_pfn = 0;
+ max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+ max_pfn = max_low_pfn;
+
+ if (parse_numa_properties())
+ setup_nonnuma();
+ else
+ dump_numa_topology();
+
+ register_cpu_notifier(&ppc64_numa_nb);
+
+ for_each_online_node(nid) {
+ unsigned long start_paddr, end_paddr;
+ int i;
+ unsigned long bootmem_paddr;
+ unsigned long bootmap_pages;
+
+ start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
+ end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
+
+ /* Allocate the node structure node local if possible */
+ NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
+ sizeof(struct pglist_data),
+ SMP_CACHE_BYTES, end_paddr);
+ NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
+ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+
+ dbg("node %d\n", nid);
+ dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
+
+ NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+ NODE_DATA(nid)->node_start_pfn =
+ init_node_data[nid].node_start_pfn;
+ NODE_DATA(nid)->node_spanned_pages =
+ end_paddr - start_paddr;
+
+ if (NODE_DATA(nid)->node_spanned_pages == 0)
+ continue;
+
+ dbg("start_paddr = %lx\n", start_paddr);
+ dbg("end_paddr = %lx\n", end_paddr);
+
+ bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
+
+ bootmem_paddr = careful_allocation(nid,
+ bootmap_pages << PAGE_SHIFT,
+ PAGE_SIZE, end_paddr);
+ memset(abs_to_virt(bootmem_paddr), 0,
+ bootmap_pages << PAGE_SHIFT);
+ dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+
+ init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+ start_paddr >> PAGE_SHIFT,
+ end_paddr >> PAGE_SHIFT);
+
+ /*
+ * We need to do another scan of all memory sections to
+ * associate memory with the correct node.
+ */
+ addr_cells = get_mem_addr_cells();
+ size_cells = get_mem_size_cells();
+ memory = NULL;
+ while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+ unsigned long mem_start, mem_size;
+ int numa_domain, ranges;
+ unsigned int *memcell_buf;
+ unsigned int len;
+
+ memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+ if (!memcell_buf || len <= 0)
+ continue;
+
+ ranges = memory->n_addrs; /* ranges in cell */
+new_range:
+ mem_start = read_n_cells(addr_cells, &memcell_buf);
+ mem_size = read_n_cells(size_cells, &memcell_buf);
+ numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0;
+
+ if (numa_domain != nid)
+ continue;
+
+ mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+ if (mem_size) {
+ dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
+ free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
+ }
+
+ if (--ranges) /* process all ranges in cell */
+ goto new_range;
+ }
+
+ /*
+ * Mark reserved regions on this node
+ */
+ for (i = 0; i < lmb.reserved.cnt; i++) {
+ unsigned long physbase = lmb.reserved.region[i].physbase;
+ unsigned long size = lmb.reserved.region[i].size;
+
+ if (pa_to_nid(physbase) != nid &&
+ pa_to_nid(physbase+size-1) != nid)
+ continue;
+
+ if (physbase < end_paddr &&
+ (physbase+size) > start_paddr) {
+ /* overlaps */
+ if (physbase < start_paddr) {
+ size -= start_paddr - physbase;
+ physbase = start_paddr;
+ }
+
+ if (size > end_paddr - physbase)
+ size = end_paddr - physbase;
+
+ dbg("reserve_bootmem %lx %lx\n", physbase,
+ size);
+ reserve_bootmem_node(NODE_DATA(nid), physbase,
+ size);
+ }
+ }
+ }
+}
+
+void __init paging_init(void)
+{
+ unsigned long zones_size[MAX_NR_ZONES];
+ unsigned long zholes_size[MAX_NR_ZONES];
+ int nid;
+
+ memset(zones_size, 0, sizeof(zones_size));
+ memset(zholes_size, 0, sizeof(zholes_size));
+
+ for_each_online_node(nid) {
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+
+ start_pfn = init_node_data[nid].node_start_pfn;
+ end_pfn = init_node_data[nid].node_end_pfn;
+
+ zones_size[ZONE_DMA] = end_pfn - start_pfn;
+ zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+ init_node_data[nid].node_present_pages;
+
+ dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
+ zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
+
+ free_area_init_node(nid, NODE_DATA(nid), zones_size,
+ start_pfn, zholes_size);
+ }
+}
+
+static int __init early_numa(char *p)
+{
+ if (!p)
+ return 0;
+
+ if (strstr(p, "off"))
+ numa_enabled = 0;
+
+ if (strstr(p, "debug"))
+ numa_debug = 1;
+
+ return 0;
+}
+early_param("numa", early_numa);
diff --git a/arch/ppc64/mm/slb.c b/arch/ppc64/mm/slb.c
new file mode 100644
index 00000000000..6a20773f695
--- /dev/null
+++ b/arch/ppc64/mm/slb.c
@@ -0,0 +1,159 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code writteh by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+
+extern void slb_allocate(unsigned long ea);
+
+static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
+{
+ return (ea & ESID_MASK) | SLB_ESID_V | slot;
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags)
+{
+ return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags;
+}
+
+static inline void create_slbe(unsigned long ea, unsigned long vsid,
+ unsigned long flags, unsigned long entry)
+{
+ asm volatile("slbmte %0,%1" :
+ : "r" (mk_vsid_data(ea, flags)),
+ "r" (mk_esid_data(ea, entry))
+ : "memory" );
+}
+
+static void slb_flush_and_rebolt(void)
+{
+ /* If you change this make sure you change SLB_NUM_BOLTED
+ * appropriately too. */
+ unsigned long ksp_flags = SLB_VSID_KERNEL;
+ unsigned long ksp_esid_data;
+
+ WARN_ON(!irqs_disabled());
+
+ if (cpu_has_feature(CPU_FTR_16M_PAGE))
+ ksp_flags |= SLB_VSID_L;
+
+ ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
+ if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
+ ksp_esid_data &= ~SLB_ESID_V;
+
+ /* We need to do this all in asm, so we're sure we don't touch
+ * the stack between the slbia and rebolting it. */
+ asm volatile("isync\n"
+ "slbia\n"
+ /* Slot 1 - first VMALLOC segment */
+ "slbmte %0,%1\n"
+ /* Slot 2 - kernel stack */
+ "slbmte %2,%3\n"
+ "isync"
+ :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
+ "r"(mk_esid_data(VMALLOCBASE, 1)),
+ "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
+ "r"(ksp_esid_data)
+ : "memory");
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+ unsigned long offset = get_paca()->slb_cache_ptr;
+ unsigned long esid_data = 0;
+ unsigned long pc = KSTK_EIP(tsk);
+ unsigned long stack = KSTK_ESP(tsk);
+ unsigned long unmapped_base;
+
+ if (offset <= SLB_CACHE_ENTRIES) {
+ int i;
+ asm volatile("isync" : : : "memory");
+ for (i = 0; i < offset; i++) {
+ esid_data = (unsigned long)get_paca()->slb_cache[i]
+ << SID_SHIFT;
+ asm volatile("slbie %0" : : "r" (esid_data));
+ }
+ asm volatile("isync" : : : "memory");
+ } else {
+ slb_flush_and_rebolt();
+ }
+
+ /* Workaround POWER5 < DD2.1 issue */
+ if (offset == 1 || offset > SLB_CACHE_ENTRIES)
+ asm volatile("slbie %0" : : "r" (esid_data));
+
+ get_paca()->slb_cache_ptr = 0;
+ get_paca()->context = mm->context;
+
+ /*
+ * preload some userspace segments into the SLB.
+ */
+ if (test_tsk_thread_flag(tsk, TIF_32BIT))
+ unmapped_base = TASK_UNMAPPED_BASE_USER32;
+ else
+ unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+ if (pc >= KERNELBASE)
+ return;
+ slb_allocate(pc);
+
+ if (GET_ESID(pc) == GET_ESID(stack))
+ return;
+
+ if (stack >= KERNELBASE)
+ return;
+ slb_allocate(stack);
+
+ if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+ || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+ return;
+
+ if (unmapped_base >= KERNELBASE)
+ return;
+ slb_allocate(unmapped_base);
+}
+
+void slb_initialize(void)
+{
+ /* On iSeries the bolted entries have already been set up by
+ * the hypervisor from the lparMap data in head.S */
+#ifndef CONFIG_PPC_ISERIES
+ unsigned long flags = SLB_VSID_KERNEL;
+
+ /* Invalidate the entire SLB (even slot 0) & all the ERATS */
+ if (cpu_has_feature(CPU_FTR_16M_PAGE))
+ flags |= SLB_VSID_L;
+
+ asm volatile("isync":::"memory");
+ asm volatile("slbmte %0,%0"::"r" (0) : "memory");
+ asm volatile("isync; slbia; isync":::"memory");
+ create_slbe(KERNELBASE, get_kernel_vsid(KERNELBASE), flags, 0);
+ create_slbe(VMALLOCBASE, get_kernel_vsid(KERNELBASE),
+ SLB_VSID_KERNEL, 1);
+ /* We don't bolt the stack for the time being - we're in boot,
+ * so the stack is in the bolted segment. By the time it goes
+ * elsewhere, we'll call _switch() which will bolt in the new
+ * one. */
+ asm volatile("isync":::"memory");
+#endif
+
+ get_paca()->stab_rr = SLB_NUM_BOLTED;
+}
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
new file mode 100644
index 00000000000..8379d678f70
--- /dev/null
+++ b/arch/ppc64/mm/slb_low.S
@@ -0,0 +1,154 @@
+/*
+ * arch/ppc64/mm/slb_low.S
+ *
+ * Low-level SLB routines
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ *
+ * Based on earlier C version:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/cputable.h>
+
+/* void slb_allocate(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ * r3 = faulting address, r13 = PACA
+ * r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ */
+_GLOBAL(slb_allocate)
+ /*
+ * First find a slot, round robin. Previously we tried to find
+ * a free slot first but that took too long. Unfortunately we
+ * dont have any LRU information to help us choose a slot.
+ */
+#ifdef CONFIG_PPC_ISERIES
+ /*
+ * On iSeries, the "bolted" stack segment can be cast out on
+ * shared processor switch so we need to check for a miss on
+ * it and restore it to the right slot.
+ */
+ ld r9,PACAKSAVE(r13)
+ clrrdi r9,r9,28
+ clrrdi r11,r3,28
+ li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */
+ cmpld r9,r11
+ beq 3f
+#endif /* CONFIG_PPC_ISERIES */
+
+ ld r10,PACASTABRR(r13)
+ addi r10,r10,1
+ /* use a cpu feature mask if we ever change our slb size */
+ cmpldi r10,SLB_NUM_ENTRIES
+
+ blt+ 4f
+ li r10,SLB_NUM_BOLTED
+
+4:
+ std r10,PACASTABRR(r13)
+3:
+ /* r3 = faulting address, r10 = entry */
+
+ srdi r9,r3,60 /* get region */
+ srdi r3,r3,28 /* get esid */
+ cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */
+
+ rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */
+ oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */
+
+ /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
+
+ blt cr7,0f /* user or kernel? */
+
+ /* kernel address: proto-VSID = ESID */
+ /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
+ * this code will generate the protoVSID 0xfffffffff for the
+ * top segment. That's ok, the scramble below will translate
+ * it to VSID 0, which is reserved as a bad VSID - one which
+ * will never have any pages in it. */
+ li r11,SLB_VSID_KERNEL
+BEGIN_FTR_SECTION
+ bne cr7,9f
+ li r11,(SLB_VSID_KERNEL|SLB_VSID_L)
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+ b 9f
+
+0: /* user address: proto-VSID = context<<15 | ESID */
+ li r11,SLB_VSID_USER
+
+ srdi. r9,r3,13
+ bne- 8f /* invalid ea bits set */
+
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+ /* check against the hugepage ranges */
+ cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT)
+ bge 6f /* >= TASK_HPAGE_END */
+ cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT)
+ bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */
+ cmpldi r3,16
+ bge 6f /* 4GB..TASK_HPAGE_BASE */
+
+ lhz r9,PACAHTLBSEGS(r13)
+ srd r9,r9,r3
+ andi. r9,r9,1
+ beq 6f
+
+5: /* this is a hugepage user address */
+ li r11,(SLB_VSID_USER|SLB_VSID_L)
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
+
+6: ld r9,PACACONTEXTID(r13)
+ rldimi r3,r9,USER_ESID_BITS,0
+
+9: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
+ ASM_VSID_SCRAMBLE(r3,r9)
+
+ rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */
+
+ /*
+ * No need for an isync before or after this slbmte. The exception
+ * we enter with and the rfid we exit with are context synchronizing.
+ */
+ slbmte r11,r10
+
+ bgelr cr7 /* we're done for kernel addresses */
+
+ /* Update the slb cache */
+ lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
+ cmpldi r3,SLB_CACHE_ENTRIES
+ bge 1f
+
+ /* still room in the slb cache */
+ sldi r11,r3,1 /* r11 = offset * sizeof(u16) */
+ rldicl r10,r10,36,28 /* get low 16 bits of the ESID */
+ add r11,r11,r13 /* r11 = (u16 *)paca + offset */
+ sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
+ addi r3,r3,1 /* offset++ */
+ b 2f
+1: /* offset >= SLB_CACHE_ENTRIES */
+ li r3,SLB_CACHE_ENTRIES+1
+2:
+ sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
+ blr
+
+8: /* invalid EA */
+ li r3,0 /* BAD_VSID */
+ li r11,SLB_VSID_USER /* flags don't much matter */
+ b 9b
diff --git a/arch/ppc64/mm/stab.c b/arch/ppc64/mm/stab.c
new file mode 100644
index 00000000000..31491131d5e
--- /dev/null
+++ b/arch/ppc64/mm/stab.c
@@ -0,0 +1,239 @@
+/*
+ * PowerPC64 Segment Translation Support.
+ *
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+
+/* Both the segment table and SLB code uses the following cache */
+#define NR_STAB_CACHE_ENTRIES 8
+DEFINE_PER_CPU(long, stab_cache_ptr);
+DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+
+/*
+ * Create a segment table entry for the given esid/vsid pair.
+ */
+static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
+{
+ unsigned long esid_data, vsid_data;
+ unsigned long entry, group, old_esid, castout_entry, i;
+ unsigned int global_entry;
+ struct stab_entry *ste, *castout_ste;
+ unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE;
+
+ vsid_data = vsid << STE_VSID_SHIFT;
+ esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
+ if (! kernel_segment)
+ esid_data |= STE_ESID_KS;
+
+ /* Search the primary group first. */
+ global_entry = (esid & 0x1f) << 3;
+ ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+
+ /* Find an empty entry, if one exists. */
+ for (group = 0; group < 2; group++) {
+ for (entry = 0; entry < 8; entry++, ste++) {
+ if (!(ste->esid_data & STE_ESID_V)) {
+ ste->vsid_data = vsid_data;
+ asm volatile("eieio":::"memory");
+ ste->esid_data = esid_data;
+ return (global_entry | entry);
+ }
+ }
+ /* Now search the secondary group. */
+ global_entry = ((~esid) & 0x1f) << 3;
+ ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+ }
+
+ /*
+ * Could not find empty entry, pick one with a round robin selection.
+ * Search all entries in the two groups.
+ */
+ castout_entry = get_paca()->stab_rr;
+ for (i = 0; i < 16; i++) {
+ if (castout_entry < 8) {
+ global_entry = (esid & 0x1f) << 3;
+ ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+ castout_ste = ste + castout_entry;
+ } else {
+ global_entry = ((~esid) & 0x1f) << 3;
+ ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+ castout_ste = ste + (castout_entry - 8);
+ }
+
+ /* Dont cast out the first kernel segment */
+ if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE)
+ break;
+
+ castout_entry = (castout_entry + 1) & 0xf;
+ }
+
+ get_paca()->stab_rr = (castout_entry + 1) & 0xf;
+
+ /* Modify the old entry to the new value. */
+
+ /* Force previous translations to complete. DRENG */
+ asm volatile("isync" : : : "memory");
+
+ old_esid = castout_ste->esid_data >> SID_SHIFT;
+ castout_ste->esid_data = 0; /* Invalidate old entry */
+
+ asm volatile("sync" : : : "memory"); /* Order update */
+
+ castout_ste->vsid_data = vsid_data;
+ asm volatile("eieio" : : : "memory"); /* Order update */
+ castout_ste->esid_data = esid_data;
+
+ asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT));
+ /* Ensure completion of slbie */
+ asm volatile("sync" : : : "memory");
+
+ return (global_entry | (castout_entry & 0x7));
+}
+
+/*
+ * Allocate a segment table entry for the given ea and mm
+ */
+static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
+{
+ unsigned long vsid;
+ unsigned char stab_entry;
+ unsigned long offset;
+
+ /* Kernel or user address? */
+ if (ea >= KERNELBASE) {
+ vsid = get_kernel_vsid(ea);
+ } else {
+ if ((ea >= TASK_SIZE_USER64) || (! mm))
+ return 1;
+
+ vsid = get_vsid(mm->context.id, ea);
+ }
+
+ stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
+
+ if (ea < KERNELBASE) {
+ offset = __get_cpu_var(stab_cache_ptr);
+ if (offset < NR_STAB_CACHE_ENTRIES)
+ __get_cpu_var(stab_cache[offset++]) = stab_entry;
+ else
+ offset = NR_STAB_CACHE_ENTRIES+1;
+ __get_cpu_var(stab_cache_ptr) = offset;
+
+ /* Order update */
+ asm volatile("sync":::"memory");
+ }
+
+ return 0;
+}
+
+int ste_allocate(unsigned long ea)
+{
+ return __ste_allocate(ea, current->mm);
+}
+
+/*
+ * Do the segment table work for a context switch: flush all user
+ * entries from the table, then preload some probably useful entries
+ * for the new task
+ */
+void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
+ struct stab_entry *ste;
+ unsigned long offset = __get_cpu_var(stab_cache_ptr);
+ unsigned long pc = KSTK_EIP(tsk);
+ unsigned long stack = KSTK_ESP(tsk);
+ unsigned long unmapped_base;
+
+ /* Force previous translations to complete. DRENG */
+ asm volatile("isync" : : : "memory");
+
+ if (offset <= NR_STAB_CACHE_ENTRIES) {
+ int i;
+
+ for (i = 0; i < offset; i++) {
+ ste = stab + __get_cpu_var(stab_cache[i]);
+ ste->esid_data = 0; /* invalidate entry */
+ }
+ } else {
+ unsigned long entry;
+
+ /* Invalidate all entries. */
+ ste = stab;
+
+ /* Never flush the first entry. */
+ ste += 1;
+ for (entry = 1;
+ entry < (PAGE_SIZE / sizeof(struct stab_entry));
+ entry++, ste++) {
+ unsigned long ea;
+ ea = ste->esid_data & ESID_MASK;
+ if (ea < KERNELBASE) {
+ ste->esid_data = 0;
+ }
+ }
+ }
+
+ asm volatile("sync; slbia; sync":::"memory");
+
+ __get_cpu_var(stab_cache_ptr) = 0;
+
+ /* Now preload some entries for the new task */
+ if (test_tsk_thread_flag(tsk, TIF_32BIT))
+ unmapped_base = TASK_UNMAPPED_BASE_USER32;
+ else
+ unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+ __ste_allocate(pc, mm);
+
+ if (GET_ESID(pc) == GET_ESID(stack))
+ return;
+
+ __ste_allocate(stack, mm);
+
+ if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+ || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+ return;
+
+ __ste_allocate(unmapped_base, mm);
+
+ /* Order update */
+ asm volatile("sync" : : : "memory");
+}
+
+extern void slb_initialize(void);
+
+/*
+ * Build an entry for the base kernel segment and put it into
+ * the segment table or SLB. All other segment table or SLB
+ * entries are faulted in.
+ */
+void stab_initialize(unsigned long stab)
+{
+ unsigned long vsid = get_kernel_vsid(KERNELBASE);
+
+ if (cpu_has_feature(CPU_FTR_SLB)) {
+ slb_initialize();
+ } else {
+ asm volatile("isync; slbia; isync":::"memory");
+ make_ste(stab, GET_ESID(KERNELBASE), vsid);
+
+ /* Order update */
+ asm volatile("sync":::"memory");
+ }
+}
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
new file mode 100644
index 00000000000..26f0172c452
--- /dev/null
+++ b/arch/ppc64/mm/tlb.c
@@ -0,0 +1,180 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ * Derived from arch/ppc64/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <linux/highmem.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/* This is declared as we are using the more or less generic
+ * include/asm-ppc64/tlb.h file -- tgall
+ */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+{
+ /* This is safe as we are holding page_table_lock */
+ cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+ struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+ if (atomic_read(&tlb->mm->mm_users) < 2 ||
+ cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+ pte_free(ptepage);
+ return;
+ }
+
+ if (*batchp == NULL) {
+ *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+ if (*batchp == NULL) {
+ pte_free_now(ptepage);
+ return;
+ }
+ (*batchp)->index = 0;
+ }
+ (*batchp)->pages[(*batchp)->index++] = ptepage;
+ if ((*batchp)->index == PTE_FREELIST_SIZE) {
+ pte_free_submit(*batchp);
+ *batchp = NULL;
+ }
+}
+
+/*
+ * Update the MMU hash table to correspond with a change to
+ * a Linux PTE. If wrprot is true, it is permissible to
+ * change the existing HPTE to read-only rather than removing it
+ * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
+ */
+void hpte_update(struct mm_struct *mm, unsigned long addr,
+ unsigned long pte, int wrprot)
+{
+ int i;
+ unsigned long context = 0;
+ struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+
+ if (REGION_ID(addr) == USER_REGION_ID)
+ context = mm->context.id;
+ i = batch->index;
+
+ /*
+ * This can happen when we are in the middle of a TLB batch and
+ * we encounter memory pressure (eg copy_page_range when it tries
+ * to allocate a new pte). If we have to reclaim memory and end
+ * up scanning and resetting referenced bits then our batch context
+ * will change mid stream.
+ */
+ if (unlikely(i != 0 && context != batch->context)) {
+ flush_tlb_pending();
+ i = 0;
+ }
+
+ if (i == 0) {
+ batch->context = context;
+ batch->mm = mm;
+ }
+ batch->pte[i] = __pte(pte);
+ batch->addr[i] = addr;
+ batch->index = ++i;
+ if (i >= PPC64_TLB_BATCH_NR)
+ flush_tlb_pending();
+}
+
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+ int i;
+ int cpu;
+ cpumask_t tmp;
+ int local = 0;
+
+ BUG_ON(in_interrupt());
+
+ cpu = get_cpu();
+ i = batch->index;
+ tmp = cpumask_of_cpu(cpu);
+ if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+ local = 1;
+
+ if (i == 1)
+ flush_hash_page(batch->context, batch->addr[0], batch->pte[0],
+ local);
+ else
+ flush_hash_range(batch->context, i, local);
+ batch->index = 0;
+ put_cpu();
+}
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+ /* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(struct page *ptepage)
+{
+ pte_freelist_forced_free++;
+
+ smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+ pte_free(ptepage);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+ struct pte_freelist_batch *batch =
+ container_of(head, struct pte_freelist_batch, rcu);
+ unsigned int i;
+
+ for (i = 0; i < batch->index; i++)
+ pte_free(batch->pages[i]);
+ free_page((unsigned long)batch);
+}
+
+void pte_free_submit(struct pte_freelist_batch *batch)
+{
+ INIT_RCU_HEAD(&batch->rcu);
+ call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pte_free_finish(void)
+{
+ /* This is safe as we are holding page_table_lock */
+ struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+ if (*batchp == NULL)
+ return;
+ pte_free_submit(*batchp);
+ *batchp = NULL;
+}