From 95119fbd87aabc263746731462062af5a38c0222 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:17:18 +0200 Subject: x86_64: move mm Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/Makefile_64 | 10 + arch/x86/mm/extable_64.c | 34 ++ arch/x86/mm/fault_64.c | 636 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_64.c | 750 +++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/ioremap_64.c | 210 ++++++++++++ arch/x86/mm/k8topology_64.c | 182 ++++++++++ arch/x86/mm/mmap_64.c | 29 ++ arch/x86/mm/numa_64.c | 648 +++++++++++++++++++++++++++++++++++ arch/x86/mm/pageattr_64.c | 249 ++++++++++++++ arch/x86/mm/srat_64.c | 566 +++++++++++++++++++++++++++++++ arch/x86_64/Makefile | 2 +- arch/x86_64/mm/Makefile | 5 - arch/x86_64/mm/Makefile_64 | 11 - arch/x86_64/mm/extable_64.c | 34 -- arch/x86_64/mm/fault_64.c | 636 ---------------------------------- arch/x86_64/mm/init_64.c | 750 ----------------------------------------- arch/x86_64/mm/ioremap_64.c | 210 ------------ arch/x86_64/mm/k8topology_64.c | 182 ---------- arch/x86_64/mm/mmap_64.c | 29 -- arch/x86_64/mm/numa_64.c | 648 ----------------------------------- arch/x86_64/mm/pageattr_64.c | 249 -------------- arch/x86_64/mm/srat_64.c | 566 ------------------------------- 23 files changed, 3316 insertions(+), 3322 deletions(-) create mode 100644 arch/x86/mm/Makefile_64 create mode 100644 arch/x86/mm/extable_64.c create mode 100644 arch/x86/mm/fault_64.c create mode 100644 arch/x86/mm/init_64.c create mode 100644 arch/x86/mm/ioremap_64.c create mode 100644 arch/x86/mm/k8topology_64.c create mode 100644 arch/x86/mm/mmap_64.c create mode 100644 arch/x86/mm/numa_64.c create mode 100644 arch/x86/mm/pageattr_64.c create mode 100644 arch/x86/mm/srat_64.c delete mode 100644 arch/x86_64/mm/Makefile delete mode 100644 arch/x86_64/mm/Makefile_64 delete mode 100644 arch/x86_64/mm/extable_64.c delete mode 100644 arch/x86_64/mm/fault_64.c delete mode 100644 arch/x86_64/mm/init_64.c delete mode 100644 arch/x86_64/mm/ioremap_64.c delete mode 100644 arch/x86_64/mm/k8topology_64.c delete mode 100644 arch/x86_64/mm/mmap_64.c delete mode 100644 arch/x86_64/mm/numa_64.c delete mode 100644 arch/x86_64/mm/pageattr_64.c delete mode 100644 arch/x86_64/mm/srat_64.c (limited to 'arch') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7317648e658..98329109684 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ ifeq ($(CONFIG_X86_32),y) include ${srctree}/arch/x86/mm/Makefile_32 else -include ${srctree}/arch/x86_64/mm/Makefile_64 +include ${srctree}/arch/x86/mm/Makefile_64 endif diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 new file mode 100644 index 00000000000..6bcb47945b8 --- /dev/null +++ b/arch/x86/mm/Makefile_64 @@ -0,0 +1,10 @@ +# +# Makefile for the linux x86_64-specific parts of the memory manager. +# + +obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_ACPI_NUMA) += srat_64.o + diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c new file mode 100644 index 00000000000..79ac6e7100a --- /dev/null +++ b/arch/x86/mm/extable_64.c @@ -0,0 +1,34 @@ +/* + * linux/arch/x86_64/mm/extable.c + */ + +#include +#include +#include +#include + +/* Simple binary search */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + /* Work around a B stepping K8 bug */ + if ((value >> 32) == 0) + value |= 0xffffffffUL << 32; + + while (first <= last) { + const struct exception_table_entry *mid; + long diff; + + mid = (last - first) / 2 + first; + diff = mid->insn - value; + if (diff == 0) + return mid; + else if (diff < 0) + first = mid+1; + else + last = mid-1; + } + return NULL; +} diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c new file mode 100644 index 00000000000..54816adb8e9 --- /dev/null +++ b/arch/x86/mm/fault_64.c @@ -0,0 +1,636 @@ +/* + * linux/arch/x86-64/mm/fault.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For unblank_screen() */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Page fault error code bits */ +#define PF_PROT (1<<0) /* or no page found */ +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} +EXPORT_SYMBOL_GPL(register_page_fault_notifier); + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); + +static inline int notify_page_fault(struct pt_regs *regs, long err) +{ + struct die_args args = { + .regs = regs, + .str = "page fault", + .err = err, + .trapnr = 14, + .signr = SIGSEGV + }; + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); +} + +/* Sometimes the CPU reports invalid exceptions on prefetch. + Check that here and ignore. + Opcode checker based on code by Richard Brunner */ +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr; + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr; + + /* If it was a exec fault ignore */ + if (error_code & PF_INSTR) + return 0; + + instr = (unsigned char __user *)convert_rip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (probe_kernel_address(instr, opcode)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* Values 0x26,0x2E,0x36,0x3E are valid x86 + prefixes. In long mode, the CPU will signal + invalid opcode if some of these prefixes are + present so we will never get here anyway */ + scan_more = ((instr_lo & 7) == 0x6); + break; + + case 0x40: + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes + Need to figure out under what instruction mode the + instruction was issued ... */ + /* Could check the LDT for lm, but for now it's good + enough to assume that long mode only uses well known + segments or kernel. */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; + + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + if (probe_kernel_address(instr, opcode)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static int bad_address(void *p) +{ + unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); +} + +void dump_pagetable(unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = (pgd_t *)read_cr3(); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud)) goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. */ + +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ + static int warned; + if (address != regs->rip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->rip = address; + return 1; + } + return 0; +} + +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + unsigned long flags = oops_begin(); + struct task_struct *tsk; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + __die("Bad pagetable", regs, error_code); + oops_end(flags); + do_exit(SIGKILL); +} + +/* + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + pgd = pgd_offset(current->mm ?: &init_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +} + +static int page_fault_trace; +int show_unhandled_signals = 1; + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long address; + const struct exception_table_entry *fixup; + int write, fault; + unsigned long flags; + siginfo_t info; + + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + + /* get the address */ + address = read_cr2(); + + info.si_code = SEGV_MAPERR; + + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ + if (unlikely(address >= TASK_SIZE64)) { + /* + * Don't check for the module range here: its PML4 + * is always initialized because it's shared with the main + * kernel text. Only vmalloc may need PML4 syncups. + */ + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + ((address >= VMALLOC_START && address < VMALLOC_END))) { + if (vmalloc_fault(address) >= 0) + return; + } + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) + return; + + if (likely(regs->eflags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(page_fault_trace)) + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt or have no user + * context, we must not take the fault.. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + /* + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. + */ + if (user_mode_vm(regs)) + error_code |= PF_USER; + + again: + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunatly, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibilty of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->rip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (likely(vma->vm_start <= address)) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & 4) { + /* Allow userspace just enough access below the stack pointer + * to let the 'enter' instruction work. + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + info.si_code = SEGV_ACCERR; + write = 0; + switch (error_code & (PF_PROT|PF_WRITE)) { + default: /* 3: write, present */ + /* fall through */ + case PF_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case PF_PROT: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + if (is_prefetch(regs, address, error_code)) + return; + + /* Work around K8 erratum #100 K8 in compat mode + occasionally jumps to illegal addresses >4GB. We + catch this here in the page fault handler because + these addresses are not reachable. Just detect this + case and return. Any code segment in LDT is + compatibility mode. */ + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->rip, + regs->rsp, error_code); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code has been set above */ + info.si_addr = (void __user *)address; + force_sig_info(SIGSEGV, &info, tsk); + return; + } + +no_context: + + /* Are we prepared to handle this kernel fault? */ + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return; + } + + /* + * Hall of shame of CPU/BIOS bugs. + */ + + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + + flags = oops_begin(); + + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at %016lx RIP: \n" KERN_ALERT,address); + printk_address(regs->rip); + dump_pagetable(address); + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + __die("Oops", regs, error_code); + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags); + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (is_init(current)) { + yield(); + goto again; + } + printk("VM: killing process %s\n", tsk->comm); + if (error_code & 4) + do_group_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + goto no_context; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, tsk); + return; +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +void vmalloc_sync_all(void) +{ + /* Note that races in the updates of insync and start aren't + problematic: + insync can only get set bits added, and updates to start are only + improving performance (without affecting correctness if undone). */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock(&pgd_lock); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. */ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +} + +static int __init enable_pagefaulttrace(char *str) +{ + page_fault_trace = 1; + return 1; +} +__setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c new file mode 100644 index 00000000000..458893b376f --- /dev/null +++ b/arch/x86/mm/init_64.c @@ -0,0 +1,750 @@ +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2002,2003 Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +const struct dma_mapping_ops* dma_ops; +EXPORT_SYMBOL(dma_ops); + +static unsigned long dma_reserve __initdata; + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +void show_mem(void) +{ + long i, total = 0, reserved = 0; + long shared = 0, cached = 0; + pg_data_t *pgdat; + struct page *page; + + printk(KERN_INFO "Mem-info:\n"); + show_free_areas(); + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + + for_each_online_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* this loop can take a while with 256 GB and 4k pages + so update the NMI watchdog */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + touch_nmi_watchdog(); + } + if (!pfn_valid(pgdat->node_start_pfn + i)) + continue; + page = pfn_to_page(pgdat->node_start_pfn + i); + total++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n",reserved); + printk(KERN_INFO "%lu pages shared\n",shared); + printk(KERN_INFO "%lu pages swap cached\n",cached); +} + +int after_bootmem; + +static __init void *spp_getpage(void) +{ + void *ptr; + if (after_bootmem) + ptr = (void *) get_zeroed_page(GFP_ATOMIC); + else + ptr = alloc_bootmem_pages(PAGE_SIZE); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); + + Dprintk("spp_getpage %p\n", ptr); + return ptr; +} + +static __init void set_pte_phys(unsigned long vaddr, + unsigned long phys, pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); + if (pmd != pmd_offset(pud, 0)) { + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + return; + } + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + pte = (pte_t *) spp_getpage(); + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); + if (pte != pte_offset_kernel(pmd, 0)) { + printk("PAGETABLE BUG #02!\n"); + return; + } + } + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); + + pte = pte_offset_kernel(pmd, vaddr); + if (!pte_none(*pte) && + pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) + pte_ERROR(*pte); + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* NOTE: this is meant to be run only at boot */ +void __init +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + printk("Invalid __set_fixmap\n"); + return; + } + set_pte_phys(address, phys, prot); +} + +unsigned long __meminitdata table_start, table_end; + +static __meminit void *alloc_low_page(unsigned long *phys) +{ + unsigned long pfn = table_end++; + void *adr; + + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + + if (pfn >= end_pfn) + panic("alloc_low_page: ran out of memory"); + + adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); + memset(adr, 0, PAGE_SIZE); + *phys = pfn * PAGE_SIZE; + return adr; +} + +static __meminit void unmap_low_page(void *adr) +{ + + if (after_bootmem) + return; + + early_iounmap(adr, PAGE_SIZE); +} + +/* Must run before zap_low_mappings */ +__meminit void *early_ioremap(unsigned long addr, unsigned long size) +{ + unsigned long vaddr; + pmd_t *pmd, *last_pmd; + int i, pmds; + + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + vaddr = __START_KERNEL_map; + pmd = level2_kernel_pgt; + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { + for (i = 0; i < pmds; i++) { + if (pmd_present(pmd[i])) + goto next; + } + vaddr += addr & ~PMD_MASK; + addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return (void *)vaddr; + next: + ; + } + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; +} + +/* To avoid virtual aliases later */ +__meminit void early_iounmap(void *addr, unsigned long size) +{ + unsigned long vaddr; + pmd_t *pmd; + int i, pmds; + + vaddr = (unsigned long)addr; + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) + pmd_clear(pmd + i); + __flush_tlb(); +} + +static void __meminit +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) +{ + int i = pmd_index(address); + + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + unsigned long entry; + pmd_t *pmd = pmd_page + pmd_index(address); + + if (address >= end) { + if (!after_bootmem) + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + + if (pmd_val(*pmd)) + continue; + + entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + +static void __meminit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud,0); + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); +} + +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +{ + int i = pud_index(addr); + + + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + if (addr >= end) + break; + + if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { + set_pud(pud, __pud(0)); + continue; + } + + if (pud_val(*pud)) { + phys_pmd_update(pud, addr, end); + continue; + } + + pmd = alloc_low_page(&pmd_phys); + spin_lock(&init_mm.page_table_lock); + set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); + phys_pmd_init(pmd, addr, end); + spin_unlock(&init_mm.page_table_lock); + unmap_low_page(pmd); + } + __flush_tlb(); +} + +static void __init find_early_table_space(unsigned long end) +{ + unsigned long puds, pmds, tables, start; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + + round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + + /* RED-PEN putting page tables only on node 0 could + cause a hotspot and fill up ZONE_DMA. The page tables + need roughly 0.5KB per GB. */ + start = 0x8000; + table_start = find_e820_area(start, end, tables); + if (table_start == -1UL) + panic("Cannot find space for the kernel page tables"); + + table_start >>= PAGE_SHIFT; + table_end = table_start; + + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, + (table_start << PAGE_SHIFT) + tables); +} + +/* Setup the direct mapping of the physical memory at PAGE_OFFSET. + This runs before bootmem is initialized and gets pages directly from the + physical memory. To access them they are temporarily mapped. */ +void __meminit init_memory_mapping(unsigned long start, unsigned long end) +{ + unsigned long next; + + Dprintk("init_memory_mapping\n"); + + /* + * Find space for the kernel direct mapping tables. + * Later we should allocate these tables in the local node of the memory + * mapped. Unfortunately this is done currently before the nodes are + * discovered. + */ + if (!after_bootmem) + find_early_table_space(end); + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + for (; start < end; start = next) { + unsigned long pud_phys; + pgd_t *pgd = pgd_offset_k(start); + pud_t *pud; + + if (after_bootmem) + pud = pud_offset(pgd, start & PGDIR_MASK); + else + pud = alloc_low_page(&pud_phys); + + next = start + PGDIR_SIZE; + if (next > end) + next = end; + phys_pud_init(pud, __pa(start), __pa(next)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + unmap_low_page(pud); + } + + if (!after_bootmem) + mmu_cr4_features = read_cr4(); + __flush_tlb_all(); +} + +#ifndef CONFIG_NUMA +void __init paging_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_NORMAL] = end_pfn; + + memory_present(0, 0, end_pfn); + sparse_init(); + free_area_init_nodes(max_zone_pfns); +} +#endif + +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches + from the CPU leading to inconsistent cache lines. address and size + must be aligned to 2MB boundaries. + Does nothing when the mapping doesn't exist. */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) +{ + unsigned long end = address + size; + + BUG_ON(address & ~LARGE_PAGE_MASK); + BUG_ON(size & ~LARGE_PAGE_MASK); + + for (; address < end; address += LARGE_PAGE_SIZE) { + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, address); + if (!pmd || pmd_none(*pmd)) + continue; + if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { + /* Could handle this, but it should not happen currently. */ + printk(KERN_ERR + "clear_kernel_mapping: mapping has been split. will leak memory\n"); + pmd_ERROR(*pmd); + } + set_pmd(pmd, __pmd(0)); + } + __flush_tlb_all(); +} + +/* + * Memory hotplug specific functions + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalram_pages++; + num_physpages++; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + init_memory_mapping(start, (start + size -1)); + + ret = __add_pages(zone, start_pfn, nr_pages); + if (ret) + goto error; + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL_GPL(arch_add_memory); + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(remove_memory); + +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) +int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. + */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif + +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, + kcore_vsyscall; + +void __init mem_init(void) +{ + long codesize, reservedpages, datasize, initsize; + + pci_iommu_alloc(); + + /* clear the zero-page */ + memset(empty_zero_page, 0, PAGE_SIZE); + + reservedpages = 0; + + /* this will put all low memory onto the freelists */ +#ifdef CONFIG_NUMA + totalram_pages = numa_free_all_bootmem(); +#else + totalram_pages = free_all_bootmem(); +#endif + reservedpages = end_pfn - totalram_pages - + absent_pages_in_range(0, end_pfn); + + after_bootmem = 1; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + /* Register memory areas for /proc/kcore */ + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + kclist_add(&kcore_kernel, &_stext, _end - _stext); + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + VSYSCALL_END - VSYSCALL_START); + + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + end_pfn << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10); +} + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr; + + if (begin >= end) + return; + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); + if (addr >= __START_KERNEL_map) + change_page_attr_addr(addr, 1, __pgprot(0)); + free_page(addr); + totalram_pages++; + } + if (addr > __START_KERNEL_map) + global_flush_tlb(); +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} + +#ifdef CONFIG_DEBUG_RODATA + +void mark_rodata_ro(void) +{ + unsigned long start = (unsigned long)_stext, end; + +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() > 1) + start = (unsigned long)_etext; +#endif + +#ifdef CONFIG_KPROBES + start = (unsigned long)__start_rodata; +#endif + + end = (unsigned long)__end_rodata; + start = (start + PAGE_SIZE - 1) & PAGE_MASK; + end &= PAGE_MASK; + if (end <= start) + return; + + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + + /* + * change_page_attr_addr() requires a global_flush_tlb() call after it. + * We do this after the printk so that if something went wrong in the + * change, the printk gets out at least to give a better debug hint + * of who is the culprit. + */ + global_flush_tlb(); +} +#endif + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif + +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ +#ifdef CONFIG_NUMA + int nid = phys_to_nid(phys); +#endif + unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { + /* This can happen with kdump kernels when accessing firmware + tables. */ + if (pfn < end_pfn_map) + return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", + phys, len); + return; + } + + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_NUMA + reserve_bootmem_node(NODE_DATA(nid), phys, len); +#else + reserve_bootmem(phys, len); +#endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { + dma_reserve += len / PAGE_SIZE; + set_dma_reserve(dma_reserve); + } +} + +int kern_addr_valid(unsigned long addr) +{ + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (above != 0 && above != -1UL) + return 0; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return 0; + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return 0; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return 0; + if (pmd_large(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return 0; + return pfn_valid(pte_pfn(*pte)); +} + +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only + covers the 64bit vsyscall page now. 32bit has a real VMA now and does + not need special handling anymore. */ + +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC +}; + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(tsk, TIF_IA32)) + return NULL; +#endif + return &gate_vma; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) + return 0; + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives. + */ +int in_gate_area_no_task(unsigned long addr) +{ + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +} + +void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return __alloc_bootmem_core(pgdat->bdata, size, + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + if (vma == &gate_vma) + return "[vsyscall]"; + return NULL; +} diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c new file mode 100644 index 00000000000..6cac90aa503 --- /dev/null +++ b/arch/x86/mm/ioremap_64.c @@ -0,0 +1,210 @@ +/* + * arch/x86_64/mm/ioremap.c + * + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) + return x - __START_KERNEL_map + phys_base; + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int +ioremap_change_attr(unsigned long phys_addr, unsigned long size, + unsigned long flags) +{ + int err = 0; + if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { + unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long vaddr = (unsigned long) __va(phys_addr); + + /* + * Must use a address here and not struct page because the phys addr + * can be a in hole between nodes and not have an memmap entry. + */ + err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); + if (!err) + global_flush_tlb(); + } + return err; +} + +/* + * Generic mapping function + */ + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) +{ + void * addr; + struct vm_struct * area; + unsigned long offset, last_addr; + pgprot_t pgprot; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (__force void __iomem *)phys_to_virt(phys_addr); + +#ifdef CONFIG_FLATMEM + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (last_addr < virt_to_phys(high_memory)) { + char *t_addr, *t_end; + struct page *page; + + t_addr = __va(phys_addr); + t_end = t_addr + (size - 1); + + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) + if(!PageReserved(page)) + return NULL; + } +#endif + + pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL + | _PAGE_DIRTY | _PAGE_ACCESSED | flags); + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP | (flags << 20)); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = area->addr; + if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, + phys_addr, pgprot)) { + remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + return NULL; + } + if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { + area->flags &= 0xffffff; + vunmap(addr); + return NULL; + } + return (__force void __iomem *) (offset + (char *)addr); +} +EXPORT_SYMBOL(__ioremap); + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ + +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, _PAGE_PCD); +} +EXPORT_SYMBOL(ioremap_nocache); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if (addr <= high_memory) + return; + if (addr >= phys_to_virt(ISA_START_ADDRESS) && + addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk("iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + if (p->flags >> 20) + ioremap_change_attr(p->phys_addr, p->size, 0); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c new file mode 100644 index 00000000000..a96006f7ae0 --- /dev/null +++ b/arch/x86/mm/k8topology_64.c @@ -0,0 +1,182 @@ +/* + * AMD K8 NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the K8 northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) + continue; + return num; + } + + return -1; +} + +int __init k8_scan_nodes(unsigned long start, unsigned long end) +{ + unsigned long prevbase; + struct bootnode nodes[8]; + int nodeid, i, j, nb; + unsigned char nodeids[8]; + int found = 0; + u32 reg; + unsigned numnodes; + unsigned num_cores; + + if (!early_pci_allowed()) + return -1; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + + num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + printk(KERN_INFO "CPU has %d num_cores\n", num_cores); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -1; + + printk(KERN_INFO "Number of nodes %d\n", numnodes); + + memset(&nodes,0,sizeof(nodes)); + prevbase = 0; + for (i = 0; i < 8; i++) { + unsigned long base,limit; + u32 nodeid; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeid = limit & 7; + nodeids[i] = nodeid; + if ((base & 3) == 0) { + if (i < numnodes) + printk("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, + base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", + nodeid, (base>>8)&3, (limit>>8) & 3); + return -1; + } + if (node_isset(nodeid, node_possible_map)) { + printk(KERN_INFO "Node %d already present. Skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > end_pfn << PAGE_SHIFT) + limit = end_pfn << PAGE_SHIFT; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + printk(KERN_ERR "Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + printk(KERN_ERR "Node map not sorted %lx,%lx\n", + prevbase,base); + return -1; + } + + printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + + found++; + + nodes[nodeid].start = base; + nodes[nodeid].end = limit; + e820_register_active_regions(nodeid, + nodes[nodeid].start >> PAGE_SHIFT, + nodes[nodeid].end >> PAGE_SHIFT); + + prevbase = base; + + node_set(nodeid, node_possible_map); + } + + if (!found) + return -1; + + memnode_shift = compute_hash_shift(nodes, 8); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + for (i = 0; i < 8; i++) { + if (nodes[i].start != nodes[i].end) { + nodeid = nodeids[i]; + for (j = 0; j < num_cores; j++) + apicid_to_node[(nodeid * num_cores) + j] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + } + + numa_init_array(); + return 0; +} diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c new file mode 100644 index 00000000000..80bba0dc000 --- /dev/null +++ b/arch/x86/mm/mmap_64.c @@ -0,0 +1,29 @@ +/* Copyright 2005 Andi Kleen, SuSE Labs. + * Licensed under GPL, v.2 + */ +#include +#include +#include +#include + +/* Notebook: move the mmap code from sys_x86_64.c over here. */ + +void arch_pick_mmap_layout(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->flags & _TIF_IA32) + return ia32_pick_mmap_layout(mm); +#endif + mm->mmap_base = TASK_UNMAPPED_BASE; + if (current->flags & PF_RANDOMIZE) { + /* Add 28bit randomness which is about 40bits of address space + because mmap base has to be page aligned. + or ~1/128 of the total user VM + (total user address space is 47bits) */ + unsigned rnd = get_random_int() & 0xfffffff; + mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; + } + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} + diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c new file mode 100644 index 00000000000..6da23552226 --- /dev/null +++ b/arch/x86/mm/numa_64.c @@ -0,0 +1,648 @@ +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +bootmem_data_t plat_node_bdata[MAX_NUMNODES]; + +struct memnode memnode; + +unsigned char cpu_to_node[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; +unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; + + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init +populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +{ + int i; + int res = -1; + unsigned long addr, end; + + memset(memnodemap, 0xff, memnodemapsize); + for (i = 0; i < numnodes; i++) { + addr = nodes[i].start; + end = nodes[i].end; + if (addr >= end) + continue; + if ((end >> shift) >= memnodemapsize) + return 0; + do { + if (memnodemap[addr >> shift] != 0xff) + return -1; + memnodemap[addr >> shift] = i; + addr += (1UL << shift); + } while (addr < end); + res = 1; + } + return res; +} + +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) + return 0; + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<= end) + continue; + bitfield |= start; + nodes_used++; + if (end > memtop) + memtop = end; + } + if (nodes_used <= 1) + i = 63; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} + +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift; + + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; + printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", + shift); + + if (populate_memnodemap(nodes, numnodes, shift) != 1) { + printk(KERN_INFO + "Your memory is not aligned you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d\n", + shift); + return -1; + } + return shift; +} + +#ifdef CONFIG_SPARSEMEM +int early_pfn_to_nid(unsigned long pfn) +{ + return phys_to_nid(pfn << PAGE_SHIFT); +} +#endif + +static void * __init +early_node_mem(int nodeid, unsigned long start, unsigned long end, + unsigned long size) +{ + unsigned long mem = find_e820_area(start, end, size); + void *ptr; + if (mem != -1L) + return __va(mem); + ptr = __alloc_bootmem_nopanic(size, + SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); + if (ptr == 0) { + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + size, nodeid); + return NULL; + } + return ptr; +} + +/* Initialize bootmem allocator for a node */ +void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) +{ + unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; + unsigned long nodedata_phys; + void *bootmap; + const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); + + start = round_up(start, ZONE_ALIGN); + + printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); + if (node_data[nodeid] == NULL) + return; + nodedata_phys = __pa(node_data[nodeid]); + + memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); + NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; + NODE_DATA(nodeid)->node_start_pfn = start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + + /* Find a place for the bootmem map */ + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + bootmap = early_node_mem(nodeid, bootmap_start, end, + bootmap_pages<= end) + free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + node_data[nodeid] = NULL; + return; + } + bootmap_start = __pa(bootmap); + Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); + + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), + bootmap_start >> PAGE_SHIFT, + start_pfn, end_pfn); + + free_bootmem_with_active_regions(nodeid, end); + + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<node_mem_map = + __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, + memmapsize, SMP_CACHE_BYTES, + round_down(limit - memmapsize, PAGE_SIZE), + limit); +#endif +} + +void __init numa_init_array(void) +{ + int rr, i; + /* There are unfortunately some poorly designed mainboards around + that only connect memory to a single CPU. This breaks the 1:1 cpu->node + mapping. To avoid this fill in the mapping for all possible + CPUs, as the number of CPUs is not known yet. + We round robin the existing nodes. */ + rr = first_node(node_online_map); + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } + +} + +#ifdef CONFIG_NUMA_EMU +/* Numa emulation */ +char *cmdline __initdata; + +/* + * Setups up nid to range from addr to addr + size. If the end boundary is + * greater than max_addr, then max_addr is used instead. The return value is 0 + * if there is additional memory left for allocation past addr and -1 otherwise. + * addr is adjusted to be at the end of the node. + */ +static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, + u64 size, u64 max_addr) +{ + int ret = 0; + nodes[nid].start = *addr; + *addr += size; + if (*addr >= max_addr) { + *addr = max_addr; + ret = -1; + } + nodes[nid].end = *addr; + node_set(nid, node_possible_map); + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + nodes[nid].start, nodes[nid].end, + (nodes[nid].end - nodes[nid].start) >> 20); + return ret; +} + +/* + * Splits num_nodes nodes up equally starting at node_start. The return value + * is the number of nodes split up and addr is adjusted to be at the end of the + * last node allocated. + */ +static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, + int num_nodes) +{ + unsigned int big; + u64 size; + int i; + + if (num_nodes <= 0) + return -1; + if (num_nodes > MAX_NUMNODES) + num_nodes = MAX_NUMNODES; + size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / + num_nodes; + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the leftovers. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / + FAKE_NODE_MIN_SIZE; + + /* Round down to nearest FAKE_NODE_MIN_SIZE. */ + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + printk(KERN_ERR "Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = node_start; i < num_nodes + node_start; i++) { + u64 end = *addr + size; + if (i < big) + end += FAKE_NODE_MIN_SIZE; + /* + * The final node can have the remaining system RAM. Other + * nodes receive roughly the same amount of available pages. + */ + if (i == num_nodes + node_start - 1) + end = max_addr; + else + while (end - *addr - e820_hole_size(*addr, end) < + size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + break; + } + return i - node_start + 1; +} + +/* + * Splits the remaining system RAM into chunks of size. The remaining memory is + * always assigned to a final node and can be asymmetric. Returns the number of + * nodes split. + */ +static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, u64 size) +{ + int i = node_start; + size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; + while (!setup_node_range(i++, nodes, addr, size, max_addr)) + ; + return i - node_start; +} + +/* + * Sets up the system RAM area from start_pfn to end_pfn according to the + * numa=fake command-line option. + */ +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + struct bootnode nodes[MAX_NUMNODES]; + u64 addr = start_pfn << PAGE_SHIFT; + u64 max_addr = end_pfn << PAGE_SHIFT; + int num_nodes = 0; + int coeff_flag; + int coeff = -1; + int num = 0; + u64 size; + int i; + + memset(&nodes, 0, sizeof(nodes)); + /* + * If the numa=fake command-line is just a single number N, split the + * system RAM into N fake nodes. + */ + if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, + simple_strtol(cmdline, NULL, 0)); + if (num_nodes < 0) + return num_nodes; + goto out; + } + + /* Parse the command line. */ + for (coeff_flag = 0; ; cmdline++) { + if (*cmdline && isdigit(*cmdline)) { + num = num * 10 + *cmdline - '0'; + continue; + } + if (*cmdline == '*') { + if (num > 0) + coeff = num; + coeff_flag = 1; + } + if (!*cmdline || *cmdline == ',') { + if (!coeff_flag) + coeff = 1; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + * Command-line coefficients are in megabytes. + */ + size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; + if (size) + for (i = 0; i < coeff; i++, num_nodes++) + if (setup_node_range(num_nodes, nodes, + &addr, size, max_addr) < 0) + goto done; + if (!*cmdline) + break; + coeff_flag = 0; + coeff = -1; + } + num = 0; + } +done: + if (!num_nodes) + return -1; + /* Fill remainder of system RAM, if appropriate. */ + if (addr < max_addr) { + if (coeff_flag && coeff < 0) { + /* Split remaining nodes into num-sized chunks */ + num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes, num); + goto out; + } + switch (*(cmdline - 1)) { + case '*': + /* Split remaining nodes into coeff chunks */ + if (coeff <= 0) + break; + num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes, coeff); + break; + case ',': + /* Do not allocate remaining system RAM */ + break; + default: + /* Give one final node */ + setup_node_range(num_nodes, nodes, &addr, + max_addr - addr, max_addr); + num_nodes++; + } + } +out: + memnode_shift = compute_hash_shift(nodes, num_nodes); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. NUMA emulation " + "disabled.\n"); + return -1; + } + + /* + * We need to vacate all active ranges that may have been registered by + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns + * true. NUMA emulation has succeeded so we will not scan ACPI nodes. + */ + remove_all_active_ranges(); +#ifdef CONFIG_ACPI_NUMA + acpi_numa = -1; +#endif + for_each_node_mask(i, node_possible_map) { + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + acpi_fake_nodes(nodes, num_nodes); + numa_init_array(); + return 0; +} +#endif /* CONFIG_NUMA_EMU */ + +void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + nodes_clear(node_possible_map); + +#ifdef CONFIG_NUMA_EMU + if (cmdline && !numa_emulation(start_pfn, end_pfn)) + return; + nodes_clear(node_possible_map); +#endif + +#ifdef CONFIG_ACPI_NUMA + if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + end_pfn << PAGE_SHIFT)) + return; + nodes_clear(node_possible_map); +#endif + +#ifdef CONFIG_K8_NUMA + if (!numa_off && !k8_scan_nodes(start_pfn<nodenumber = node; + cpu_to_node[cpu] = node; +} + +unsigned long __init numa_free_all_bootmem(void) +{ + int i; + unsigned long pages = 0; + for_each_online_node(i) { + pages += free_all_bootmem_node(NODE_DATA(i)); + } + return pages; +} + +void __init paging_init(void) +{ + int i; + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_NORMAL] = end_pfn; + + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); + + for_each_online_node(i) { + setup_node_zones(i); + } + + free_area_init_nodes(max_zone_pfns); +} + +static __init int numa_setup(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt,"off",3)) + numa_off = 1; +#ifdef CONFIG_NUMA_EMU + if (!strncmp(opt, "fake=", 5)) + cmdline = opt + 5; +#endif +#ifdef CONFIG_ACPI_NUMA + if (!strncmp(opt,"noacpi",6)) + acpi_numa = -1; + if (!strncmp(opt,"hotadd=", 7)) + hotadd_percent = simple_strtoul(opt+7, NULL, 10); +#endif + return 0; +} + +early_param("numa", numa_setup); + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + */ +void __init init_cpu_to_node(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) { + u8 apicid = x86_cpu_to_apicid[i]; + if (apicid == BAD_APICID) + continue; + if (apicid_to_node[apicid] == NUMA_NO_NODE) + continue; + numa_set_node(i,apicid_to_node[apicid]); + } +} + +EXPORT_SYMBOL(cpu_to_node); +EXPORT_SYMBOL(node_to_cpumask); +EXPORT_SYMBOL(memnode); +EXPORT_SYMBOL(node_data); + +#ifdef CONFIG_DISCONTIGMEM +/* + * Functions to convert PFNs from/to per node page addresses. + * These are out of line because they are quite big. + * They could be all tuned by pre caching more state. + * Should do that. + */ + +int pfn_valid(unsigned long pfn) +{ + unsigned nid; + if (pfn >= num_physpages) + return 0; + nid = pfn_to_nid(pfn); + if (nid == 0xff) + return 0; + return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); +} +EXPORT_SYMBOL(pfn_valid); +#endif diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c new file mode 100644 index 00000000000..10b9809ce82 --- /dev/null +++ b/arch/x86/mm/pageattr_64.c @@ -0,0 +1,249 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pte_t *lookup_address(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return NULL; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, address); + if (pte && !pte_present(*pte)) + pte = NULL; + return pte; +} + +static struct page *split_large_page(unsigned long address, pgprot_t prot, + pgprot_t ref_prot) +{ + int i; + unsigned long addr; + struct page *base = alloc_pages(GFP_KERNEL, 0); + pte_t *pbase; + if (!base) + return NULL; + /* + * page_private is used to track the number of entries in + * the page table page have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : ref_prot); + } + return base; +} + +static void cache_flush_page(void *adr) +{ + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (adr + i)); +} + +static void flush_kernel_map(void *arg) +{ + struct list_head *l = (struct list_head *)arg; + struct page *pg; + + /* When clflush is available always use it because it is + much cheaper than WBINVD. */ + /* clflush is still broken. Disable for now. */ + if (1 || !cpu_has_clflush) + asm volatile("wbinvd" ::: "memory"); + else list_for_each_entry(pg, l, lru) { + void *adr = page_address(pg); + cache_flush_page(adr); + } + __flush_tlb_all(); +} + +static inline void flush_map(struct list_head *l) +{ + on_each_cpu(flush_kernel_map, l, 1, 1); +} + +static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ + +static inline void save_page(struct page *fpage) +{ + if (!test_and_set_bit(PG_arch_1, &fpage->flags)) + list_add(&fpage->lru, &deferred_pages); +} + +/* + * No more special protections in this 2/4MB area - revert to a + * large page again. + */ +static void revert_page(unsigned long address, pgprot_t ref_prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t large_pte; + unsigned long pfn; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd,address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + BUG_ON(pmd_val(*pmd) & _PAGE_PSE); + pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; + large_pte = pfn_pte(pfn, ref_prot); + large_pte = pte_mkhuge(large_pte); + set_pte((pte_t *)pmd, large_pte); +} + +static int +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, + pgprot_t ref_prot) +{ + pte_t *kpte; + struct page *kpte_page; + pgprot_t ref_prot2; + + kpte = lookup_address(address); + if (!kpte) return 0; + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + if (pgprot_val(prot) != pgprot_val(ref_prot)) { + if (!pte_huge(*kpte)) { + set_pte(kpte, pfn_pte(pfn, prot)); + } else { + /* + * split_large_page will take the reference for this + * change_page_attr on the split page. + */ + struct page *split; + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); + split = split_large_page(address, prot, ref_prot2); + if (!split) + return -ENOMEM; + set_pte(kpte, mk_pte(split, ref_prot2)); + kpte_page = split; + } + page_private(kpte_page)++; + } else if (!pte_huge(*kpte)) { + set_pte(kpte, pfn_pte(pfn, ref_prot)); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; + } else + BUG(); + + /* on x86-64 the direct mapping set at boot is not using 4k pages */ + BUG_ON(PageReserved(kpte_page)); + + save_page(kpte_page); + if (page_private(kpte_page) == 0) + revert_page(address, ref_prot); + return 0; +} + +/* + * Change the page attributes of an page in the linear mapping. + * + * This should be used when a page is mapped with a different caching policy + * than write-back somewhere - some CPUs do not like it when mappings with + * different caching policies exist. This changes the page attributes of the + * in kernel linear mapping too. + * + * The caller needs to ensure that there are no conflicting mappings elsewhere. + * This function only deals with the kernel linear map. + * + * Caller must call global_flush_tlb() after this. + */ +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) +{ + int err = 0, kernel_map = 0; + int i; + + if (address >= __START_KERNEL_map + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { + address = (unsigned long)__va(__pa(address)); + kernel_map = 1; + } + + down_write(&init_mm.mmap_sem); + for (i = 0; i < numpages; i++, address += PAGE_SIZE) { + unsigned long pfn = __pa(address) >> PAGE_SHIFT; + + if (!kernel_map || pte_present(pfn_pte(0, prot))) { + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + } + /* Handle kernel mapping too which aliases part of the + * lowmem */ + if (__pa(address) < KERNEL_TEXT_SIZE) { + unsigned long addr2; + pgprot_t prot2; + addr2 = __START_KERNEL_map + __pa(address); + /* Make sure the kernel mappings stay executable */ + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); + err = __change_page_attr(addr2, pfn, prot2, + PAGE_KERNEL_EXEC); + } + } + up_write(&init_mm.mmap_sem); + return err; +} + +/* Don't call this for MMIO areas that may not have a mem_map entry */ +int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + unsigned long addr = (unsigned long)page_address(page); + return change_page_attr_addr(addr, numpages, prot); +} + +void global_flush_tlb(void) +{ + struct page *pg, *next; + struct list_head l; + + down_read(&init_mm.mmap_sem); + list_replace_init(&deferred_pages, &l); + up_read(&init_mm.mmap_sem); + + flush_map(&l); + + list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (page_private(pg) != 0) + continue; + ClearPagePrivate(pg); + __free_page(pg); + } +} + +EXPORT_SYMBOL(change_page_attr); +EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c new file mode 100644 index 00000000000..acdf03e1914 --- /dev/null +++ b/arch/x86/mm/srat_64.c @@ -0,0 +1,566 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int acpi_numa __initdata; + +static struct acpi_table_slit *acpi_slit; + +static nodemask_t nodes_parsed __initdata; +static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes_add[MAX_NUMNODES]; +static int found_add_area __initdata; +int hotadd_percent __initdata = 0; + +/* Too small nodes confuse the VM badly. Usually they result + from BIOS bugs. */ +#define NODE_MIN_SIZE (4*1024*1024) + +static __init int setup_node(int pxm) +{ + return acpi_map_pxm_to_node(pxm); +} + +static __init int conflicting_nodes(unsigned long start, unsigned long end) +{ + int i; + for_each_node_mask(i, nodes_parsed) { + struct bootnode *nd = &nodes[i]; + if (nd->start == nd->end) + continue; + if (nd->end > start && nd->start < end) + return i; + if (nd->end == end && nd->start == start) + return i; + } + return -1; +} + +static __init void cutoff_node(int i, unsigned long start, unsigned long end) +{ + struct bootnode *nd = &nodes[i]; + + if (found_add_area) + return; + + if (nd->start < start) { + nd->start = start; + if (nd->end < nd->start) + nd->start = nd->end; + } + if (nd->end > end) { + nd->end = end; + if (nd->start > nd->end) + nd->start = nd->end; + } +} + +static __init void bad_srat(void) +{ + int i; + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; + found_add_area = 0; + for (i = 0; i < MAX_LOCAL_APIC; i++) + apicid_to_node[i] = NUMA_NO_NODE; + for (i = 0; i < MAX_NUMNODES; i++) + nodes_add[i].start = nodes[i].end = 0; + remove_all_active_ranges(); +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static __init int slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->locality_count; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != LOCAL_DISTANCE) + return 0; + } else if (val <= LOCAL_DISTANCE) + return 0; + } + } + return 1; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + if (!slit_valid(slit)) { + printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); + return; + } + acpi_slit = slit; +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + apicid_to_node[pa->apic_id] = node; + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, pa->apic_id, node); +} + +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +/* + * Protect against too large hotadd areas that would fill up memory. + */ +static int hotadd_enough_memory(struct bootnode *nd) +{ + static unsigned long allocated; + static unsigned long last_area_end; + unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; + long mem = pages * sizeof(struct page); + unsigned long addr; + unsigned long allowed; + unsigned long oldpages = pages; + + if (mem < 0) + return 0; + allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; + allowed = (allowed / 100) * hotadd_percent; + if (allocated + mem > allowed) { + unsigned long range; + /* Give them at least part of their hotadd memory upto hotadd_percent + It would be better to spread the limit out + over multiple hotplug areas, but that is too complicated + right now */ + if (allocated >= allowed) + return 0; + range = allowed - allocated; + pages = (range / PAGE_SIZE); + mem = pages * sizeof(struct page); + nd->end = nd->start + range; + } + /* Not completely fool proof, but a good sanity check */ + addr = find_e820_area(last_area_end, end_pfn<> PAGE_SHIFT) > end_pfn) + end_pfn = end >> PAGE_SHIFT; + return 1; +} + +static inline int save_add_info(void) +{ + return hotadd_percent > 0; +} +#else +int update_end_of_memory(unsigned long end) {return -1;} +static int hotadd_enough_memory(struct bootnode *nd) {return 1;} +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif +#endif +/* + * Update nodes_add and decide if to include add are in the zone. + * Both SPARSE and RESERVE need nodes_add infomation. + * This code supports one contigious hot add area per node. + */ +static int reserve_hotadd(int node, unsigned long start, unsigned long end) +{ + unsigned long s_pfn = start >> PAGE_SHIFT; + unsigned long e_pfn = end >> PAGE_SHIFT; + int ret = 0, changed = 0; + struct bootnode *nd = &nodes_add[node]; + + /* I had some trouble with strange memory hotadd regions breaking + the boot. Be very strict here and reject anything unexpected. + If you want working memory hotadd write correct SRATs. + + The node size check is a basic sanity check to guard against + mistakes */ + if ((signed long)(end - start) < NODE_MIN_SIZE) { + printk(KERN_ERR "SRAT: Hotplug area too small\n"); + return -1; + } + + /* This check might be a bit too strict, but I'm keeping it for now. */ + if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { + printk(KERN_ERR + "SRAT: Hotplug area %lu -> %lu has existing memory\n", + s_pfn, e_pfn); + return -1; + } + + if (!hotadd_enough_memory(&nodes_add[node])) { + printk(KERN_ERR "SRAT: Hotplug area too large\n"); + return -1; + } + + /* Looks good */ + + if (nd->start == nd->end) { + nd->start = start; + nd->end = end; + changed = 1; + } else { + if (nd->start == end) { + nd->start = start; + changed = 1; + } + if (nd->end == start) { + nd->end = end; + changed = 1; + } + if (!changed) + printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); + } + + ret = update_end_of_memory(nd->end); + + if (changed) + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + return ret; +} + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + struct bootnode *nd, oldnode; + unsigned long start, end; + int node, pxm; + int i; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; + + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + return; + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + i = conflicting_nodes(start, end); + if (i == node) { + printk(KERN_WARNING + "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", + pxm, start, end, nodes[i].start, nodes[i].end); + } else if (i >= 0) { + printk(KERN_ERR + "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", + pxm, start, end, node_to_pxm(i), + nodes[i].start, nodes[i].end); + bad_srat(); + return; + } + nd = &nodes[node]; + oldnode = *nd; + if (!node_test_and_set(node, nodes_parsed)) { + nd->start = start; + nd->end = end; + } else { + if (start < nd->start) + nd->start = start; + if (nd->end < end) + nd->end = end; + } + + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, + nd->start, nd->end); + e820_register_active_regions(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); + push_node_boundaries(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); + + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && + (reserve_hotadd(node, start, end) < 0)) { + /* Ignore hotadd region. Undo damage */ + printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + *nd = oldnode; + if ((nd->start | nd->end) == 0) + node_clear(node, nodes_parsed); + } +} + +/* Sanity check to catch more bad SRATs (they are amazingly common). + Make sure the PXMs cover all memory. */ +static int __init nodes_cover_memory(const struct bootnode *nodes) +{ + int i; + unsigned long pxmram, e820ram; + + pxmram = 0; + for_each_node_mask(i, nodes_parsed) { + unsigned long s = nodes[i].start >> PAGE_SHIFT; + unsigned long e = nodes[i].end >> PAGE_SHIFT; + pxmram += e - s; + pxmram -= absent_pages_in_range(s, e); + if ((long)pxmram < 0) + pxmram = 0; + } + + e820ram = end_pfn - absent_pages_in_range(0, end_pfn); + /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ + if ((long)(e820ram - pxmram) >= 1*1024*1024) { + printk(KERN_ERR + "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", + (pxmram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + +static void unparse_node(int node) +{ + int i; + node_clear(node, nodes_parsed); + for (i = 0; i < MAX_LOCAL_APIC; i++) { + if (apicid_to_node[i] == node) + apicid_to_node[i] = NUMA_NO_NODE; + } +} + +void __init acpi_numa_arch_fixup(void) {} + +/* Use the information discovered above to actually set up the nodes. */ +int __init acpi_scan_nodes(unsigned long start, unsigned long end) +{ + int i; + + if (acpi_numa <= 0) + return -1; + + /* First clean up the node list */ + for (i = 0; i < MAX_NUMNODES; i++) { + cutoff_node(i, start, end); + if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { + unparse_node(i); + node_set_offline(i); + } + } + + if (!nodes_cover_memory(nodes)) { + bad_srat(); + return -1; + } + + memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); + if (memnode_shift < 0) { + printk(KERN_ERR + "SRAT: No NUMA node hash function found. Contact maintainer\n"); + bad_srat(); + return -1; + } + + node_possible_map = nodes_parsed; + + /* Finally register nodes */ + for_each_node_mask(i, node_possible_map) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + /* Try again in case setup_node_bootmem missed one due + to missing bootmem */ + for_each_node_mask(i, node_possible_map) + if (!node_online(i)) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] == NUMA_NO_NODE) + continue; + if (!node_isset(cpu_to_node[i], node_possible_map)) + numa_set_node(i, NUMA_NO_NODE); + } + numa_init_array(); + return 0; +} + +#ifdef CONFIG_NUMA_EMU +static int __init find_node_by_addr(unsigned long addr) +{ + int ret = NUMA_NO_NODE; + int i; + + for_each_node_mask(i, nodes_parsed) { + /* + * Find the real node that this emulated node appears on. For + * the sake of simplicity, we only use a real node's starting + * address to determine which emulated node it appears on. + */ + if (addr >= nodes[i].start && addr < nodes[i].end) { + ret = i; + break; + } + } + return i; +} + +/* + * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID + * mappings that respect the real ACPI topology but reflect our emulated + * environment. For each emulated node, we find which real node it appears on + * and create PXM to NID mappings for those fake nodes which mirror that + * locality. SLIT will now represent the correct distances between emulated + * nodes as a result of the real topology. + */ +void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) +{ + int i, j; + int fake_node_to_pxm_map[MAX_NUMNODES] = { + [0 ... MAX_NUMNODES-1] = PXM_INVAL + }; + unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE + }; + + printk(KERN_INFO "Faking PXM affinity for fake nodes on real " + "topology.\n"); + for (i = 0; i < num_nodes; i++) { + int nid, pxm; + + nid = find_node_by_addr(fake_nodes[i].start); + if (nid == NUMA_NO_NODE) + continue; + pxm = node_to_pxm(nid); + if (pxm == PXM_INVAL) + continue; + fake_node_to_pxm_map[i] = pxm; + /* + * For each apicid_to_node mapping that exists for this real + * node, it must now point to the fake node ID. + */ + for (j = 0; j < MAX_LOCAL_APIC; j++) + if (apicid_to_node[j] == nid) + fake_apicid_to_node[j] = i; + } + for (i = 0; i < num_nodes; i++) + __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); + memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); + + nodes_clear(nodes_parsed); + for (i = 0; i < num_nodes; i++) + if (fake_nodes[i].start != fake_nodes[i].end) + node_set(i, nodes_parsed); + WARN_ON(!nodes_cover_memory(fake_nodes)); +} + +static int null_slit_node_compare(int a, int b) +{ + return node_to_pxm(a) == node_to_pxm(b); +} +#else +static int null_slit_node_compare(int a, int b) +{ + return a == b; +} +#endif /* CONFIG_NUMA_EMU */ + +void __init srat_reserve_add_area(int nodeid) +{ + if (found_add_area && nodes_add[nodeid].end) { + u64 total_mb; + + printk(KERN_INFO "SRAT: Reserving hot-add memory space " + "for node %d at %Lx-%Lx\n", + nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); + total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) + >> PAGE_SHIFT; + total_mb *= sizeof(struct page); + total_mb >>= 20; + printk(KERN_INFO "SRAT: This will cost you %Lu MB of " + "pre-allocated memory.\n", (unsigned long long)total_mb); + reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, + nodes_add[nodeid].end - nodes_add[nodeid].start); + } +} + +int __node_distance(int a, int b) +{ + int index; + + if (!acpi_slit) + return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : + REMOTE_DISTANCE; + index = acpi_slit->locality_count * node_to_pxm(a); + return acpi_slit->entry[index + node_to_pxm(b)]; +} + +EXPORT_SYMBOL(__node_distance); + +int memory_add_physaddr_to_nid(u64 start) +{ + int i, ret = 0; + + for_each_node(i) + if (nodes_add[i].start <= start && nodes_add[i].end > start) + ret = i; + + return ret; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 11ef2c37ba5..4208a0ded52 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -75,7 +75,7 @@ head-y := arch/x86_64/kernel/head_64.o arch/x86_64/kernel/head64.o arch/x86_64/k libs-y += arch/x86/lib/ core-y += arch/x86_64/kernel/ \ - arch/x86_64/mm/ \ + arch/x86/mm/ \ arch/x86/crypto/ \ arch/x86/vdso/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile deleted file mode 100644 index 7317648e658..00000000000 --- a/arch/x86_64/mm/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/mm/Makefile_32 -else -include ${srctree}/arch/x86_64/mm/Makefile_64 -endif diff --git a/arch/x86_64/mm/Makefile_64 b/arch/x86_64/mm/Makefile_64 deleted file mode 100644 index 5c2883cad11..00000000000 --- a/arch/x86_64/mm/Makefile_64 +++ /dev/null @@ -1,11 +0,0 @@ -# -# Makefile for the linux x86_64-specific parts of the memory manager. -# - -obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_NUMA) += numa_64.o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_64.o - -hugetlbpage-y = ../../x86/mm/hugetlbpage.o diff --git a/arch/x86_64/mm/extable_64.c b/arch/x86_64/mm/extable_64.c deleted file mode 100644 index 79ac6e7100a..00000000000 --- a/arch/x86_64/mm/extable_64.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * linux/arch/x86_64/mm/extable.c - */ - -#include -#include -#include -#include - -/* Simple binary search */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* Work around a B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} diff --git a/arch/x86_64/mm/fault_64.c b/arch/x86_64/mm/fault_64.c deleted file mode 100644 index 54816adb8e9..00000000000 --- a/arch/x86_64/mm/fault_64.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * linux/arch/x86-64/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For unblank_screen() */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* Page fault error code bits */ -#define PF_PROT (1<<0) /* or no page found */ -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) - -static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); - -/* Hook to register for page fault notifications */ -int register_page_fault_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); -} -EXPORT_SYMBOL_GPL(register_page_fault_notifier); - -int unregister_page_fault_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); - -static inline int notify_page_fault(struct pt_regs *regs, long err) -{ - struct die_args args = { - .regs = regs, - .str = "page fault", - .err = err, - .trapnr = 14, - .signr = SIGSEGV - }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, - DIE_PAGE_FAULT, &args); -} - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr; - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr; - - /* If it was a exec fault ignore */ - if (error_code & PF_INSTR) - return 0; - - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); - max_instr = instr + 15; - - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; - - pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - unsigned long flags = oops_begin(); - struct task_struct *tsk; - - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Bad pagetable", regs, error_code); - oops_end(flags); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - pgd = pgd_offset(current->mm ?: &init_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -} - -static int page_fault_trace; -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - const struct exception_table_entry *fixup; - int write, fault; - unsigned long flags; - siginfo_t info; - - tsk = current; - mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - /* get the address */ - address = read_cr2(); - - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(page_fault_trace)) - printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", - regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); - - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; - - again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibilty of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (likely(vma->vm_start <= address)) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* Allow userspace just enough access below the stack pointer - * to let the 'enter' instruction work. - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - if (is_prefetch(regs, address, error_code)) - return; - - /* Work around K8 erratum #100 K8 in compat mode - occasionally jumps to illegal addresses >4GB. We - catch this here in the page fault handler because - these addresses are not reachable. Just detect this - case and return. Any code segment in LDT is - compatibility mode. */ - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - flags = oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_init(current)) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - /* Note that races in the updates of insync and start aren't - problematic: - insync can only get set bits added, and updates to start are only - improving performance (without affecting correctness if undone). */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock(&pgd_lock); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock(&pgd_lock); - set_bit(pgd_index(address), insync); - } - if (address == start) - start = address + PGDIR_SIZE; - } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); -} - -static int __init enable_pagefaulttrace(char *str) -{ - page_fault_trace = 1; - return 1; -} -__setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init_64.c b/arch/x86_64/mm/init_64.c deleted file mode 100644 index 458893b376f..00000000000 --- a/arch/x86_64/mm/init_64.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * linux/arch/x86_64/mm/init.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek - * Copyright (C) 2002,2003 Andi Kleen - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef Dprintk -#define Dprintk(x...) -#endif - -const struct dma_mapping_ops* dma_ops; -EXPORT_SYMBOL(dma_ops); - -static unsigned long dma_reserve __initdata; - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - -/* - * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the - * physical space so we can cache the place of the first one and move - * around without checking the pgd every time. - */ - -void show_mem(void) -{ - long i, total = 0, reserved = 0; - long shared = 0, cached = 0; - pg_data_t *pgdat; - struct page *page; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* this loop can take a while with 256 GB and 4k pages - so update the NMI watchdog */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { - touch_nmi_watchdog(); - } - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n",reserved); - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); -} - -int after_bootmem; - -static __init void *spp_getpage(void) -{ - void *ptr; - if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); - else - ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - - Dprintk("spp_getpage %p\n", ptr); - return ptr; -} - -static __init void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, new_pte; - - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); - - pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); - if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); - return; - } - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); - return; - } - } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); - - pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); - set_pte(pte, new_pte); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -/* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, prot); -} - -unsigned long __meminitdata table_start, table_end; - -static __meminit void *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = table_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); - *phys = __pa(adr); - return adr; - } - - if (pfn >= end_pfn) - panic("alloc_low_page: ran out of memory"); - - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; - return adr; -} - -static __meminit void unmap_low_page(void *adr) -{ - - if (after_bootmem) - return; - - early_iounmap(adr, PAGE_SIZE); -} - -/* Must run before zap_low_mappings */ -__meminit void *early_ioremap(unsigned long addr, unsigned long size) -{ - unsigned long vaddr; - pmd_t *pmd, *last_pmd; - int i, pmds; - - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - vaddr = __START_KERNEL_map; - pmd = level2_kernel_pgt; - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { - for (i = 0; i < pmds; i++) { - if (pmd_present(pmd[i])) - goto next; - } - vaddr += addr & ~PMD_MASK; - addr &= PMD_MASK; - for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return (void *)vaddr; - next: - ; - } - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); - return NULL; -} - -/* To avoid virtual aliases later */ -__meminit void early_iounmap(void *addr, unsigned long size) -{ - unsigned long vaddr; - pmd_t *pmd; - int i, pmds; - - vaddr = (unsigned long)addr; - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - pmd = level2_kernel_pgt + pmd_index(vaddr); - for (i = 0; i < pmds; i++) - pmd_clear(pmd + i); - __flush_tlb(); -} - -static void __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) -{ - int i = pmd_index(address); - - for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long entry; - pmd_t *pmd = pmd_page + pmd_index(address); - - if (address >= end) { - if (!after_bootmem) - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - - if (pmd_val(*pmd)) - continue; - - entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; - entry &= __supported_pte_mask; - set_pmd(pmd, __pmd(entry)); - } -} - -static void __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) -{ - pmd_t *pmd = pmd_offset(pud,0); - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); -} - -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) -{ - int i = pud_index(addr); - - - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { - unsigned long pmd_phys; - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - if (addr >= end) - break; - - if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { - set_pud(pud, __pud(0)); - continue; - } - - if (pud_val(*pud)) { - phys_pmd_update(pud, addr, end); - continue; - } - - pmd = alloc_low_page(&pmd_phys); - spin_lock(&init_mm.page_table_lock); - set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, addr, end); - spin_unlock(&init_mm.page_table_lock); - unmap_low_page(pmd); - } - __flush_tlb(); -} - -static void __init find_early_table_space(unsigned long end) -{ - unsigned long puds, pmds, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + - round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - - /* RED-PEN putting page tables only on node 0 could - cause a hotspot and fill up ZONE_DMA. The page tables - need roughly 0.5KB per GB. */ - start = 0x8000; - table_start = find_e820_area(start, end, tables); - if (table_start == -1UL) - panic("Cannot find space for the kernel page tables"); - - table_start >>= PAGE_SHIFT; - table_end = table_start; - - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); -} - -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. */ -void __meminit init_memory_mapping(unsigned long start, unsigned long end) -{ - unsigned long next; - - Dprintk("init_memory_mapping\n"); - - /* - * Find space for the kernel direct mapping tables. - * Later we should allocate these tables in the local node of the memory - * mapped. Unfortunately this is done currently before the nodes are - * discovered. - */ - if (!after_bootmem) - find_early_table_space(end); - - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); - - for (; start < end; start = next) { - unsigned long pud_phys; - pgd_t *pgd = pgd_offset_k(start); - pud_t *pud; - - if (after_bootmem) - pud = pud_offset(pgd, start & PGDIR_MASK); - else - pud = alloc_low_page(&pud_phys); - - next = start + PGDIR_SIZE; - if (next > end) - next = end; - phys_pud_init(pud, __pa(start), __pa(next)); - if (!after_bootmem) - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(pud); - } - - if (!after_bootmem) - mmu_cr4_features = read_cr4(); - __flush_tlb_all(); -} - -#ifndef CONFIG_NUMA -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - memory_present(0, 0, end_pfn); - sparse_init(); - free_area_init_nodes(max_zone_pfns); -} -#endif - -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - -/* - * Memory hotplug specific functions - */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ -int arch_add_memory(int nid, u64 start, u64 size) -{ - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - int ret; - - init_memory_mapping(start, (start + size -1)); - - ret = __add_pages(zone, start_pfn, nr_pages); - if (ret) - goto error; - - return ret; -error: - printk("%s: Problem encountered in __add_pages!\n", __func__); - return ret; -} -EXPORT_SYMBOL_GPL(arch_add_memory); - -int remove_memory(u64 start, u64 size) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(remove_memory); - -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -#endif /* CONFIG_MEMORY_HOTPLUG */ - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. - */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) -{ - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; -} -#endif - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; - -void __init mem_init(void) -{ - long codesize, reservedpages, datasize, initsize; - - pci_iommu_alloc(); - - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - - reservedpages = 0; - - /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else - totalram_pages = free_all_bootmem(); -#endif - reservedpages = end_pfn - totalram_pages - - absent_pages_in_range(0, end_pfn); - - after_bootmem = 1; - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); - - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - end_pfn << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); -} - -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr; - - if (begin >= end) - return; - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - if (addr >= __START_KERNEL_map) - change_page_attr_addr(addr, 1, __pgprot(0)); - free_page(addr); - totalram_pages++; - } - if (addr > __START_KERNEL_map) - global_flush_tlb(); -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - -#ifdef CONFIG_DEBUG_RODATA - -void mark_rodata_ro(void) -{ - unsigned long start = (unsigned long)_stext, end; - -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) - return; - - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); - - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - (end - start) >> 10); - - /* - * change_page_attr_addr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); -} -#endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ -#ifdef CONFIG_NUMA - int nid = phys_to_nid(phys); -#endif - unsigned long pfn = phys >> PAGE_SHIFT; - if (pfn >= end_pfn) { - /* This can happen with kdump kernels when accessing firmware - tables. */ - if (pfn < end_pfn_map) - return; - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", - phys, len); - return; - } - - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); -#endif - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } -} - -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) - return 0; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return 0; - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - return pfn_valid(pte_pfn(*pte)); -} - -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only - covers the 64bit vsyscall page now. 32bit has a real VMA now and does - not need special handling anymore. */ - -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC -}; - -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) - return NULL; -#endif - return &gate_vma; -} - -int in_gate_area(struct task_struct *task, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(task); - if (!vma) - return 0; - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. - */ -int in_gate_area_no_task(unsigned long addr) -{ - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); -} - -void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) -{ - return __alloc_bootmem_core(pgdat->bdata, size, - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; -} diff --git a/arch/x86_64/mm/ioremap_64.c b/arch/x86_64/mm/ioremap_64.c deleted file mode 100644 index 6cac90aa503..00000000000 --- a/arch/x86_64/mm/ioremap_64.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Fix up the linear direct mapping of the kernel to avoid cache attribute - * conflicts. - */ -static int -ioremap_change_attr(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - int err = 0; - if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vaddr = (unsigned long) __va(phys_addr); - - /* - * Must use a address here and not struct page because the phys addr - * can be a in hole between nodes and not have an memmap entry. - */ - err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); - if (!err) - global_flush_tlb(); - } - return err; -} - -/* - * Generic mapping function - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t pgprot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (__force void __iomem *)phys_to_virt(phys_addr); - -#ifdef CONFIG_FLATMEM - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (last_addr < virt_to_phys(high_memory)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } -#endif - - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL - | _PAGE_DIRTY | _PAGE_ACCESSED | flags); - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); - return NULL; - } - if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { - area->flags &= 0xffffff; - vunmap(addr); - return NULL; - } - return (__force void __iomem *) (offset + (char *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return __ioremap(phys_addr, size, _PAGE_PCD); -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if (addr <= high_memory) - return; - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if (p->flags >> 20) - ioremap_change_attr(p->phys_addr, p->size, 0); - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - diff --git a/arch/x86_64/mm/k8topology_64.c b/arch/x86_64/mm/k8topology_64.c deleted file mode 100644 index a96006f7ae0..00000000000 --- a/arch/x86_64/mm/k8topology_64.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * AMD K8 NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the K8 northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) - continue; - return num; - } - - return -1; -} - -int __init k8_scan_nodes(unsigned long start, unsigned long end) -{ - unsigned long prevbase; - struct bootnode nodes[8]; - int nodeid, i, j, nb; - unsigned char nodeids[8]; - int found = 0; - u32 reg; - unsigned numnodes; - unsigned num_cores; - - if (!early_pci_allowed()) - return -1; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - - num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - printk(KERN_INFO "CPU has %d num_cores\n", num_cores); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -1; - - printk(KERN_INFO "Number of nodes %d\n", numnodes); - - memset(&nodes,0,sizeof(nodes)); - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base,limit; - u32 nodeid; - - base = read_pci_config(0, nb, 1, 0x40 + i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeid = limit & 7; - nodeids[i] = nodeid; - if ((base & 3) == 0) { - if (i < numnodes) - printk("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, - base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); - return -1; - } - if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > end_pfn << PAGE_SHIFT) - limit = end_pfn << PAGE_SHIFT; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - printk(KERN_ERR "Node map not sorted %lx,%lx\n", - prevbase,base); - return -1; - } - - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - found++; - - nodes[nodeid].start = base; - nodes[nodeid].end = limit; - e820_register_active_regions(nodeid, - nodes[nodeid].start >> PAGE_SHIFT, - nodes[nodeid].end >> PAGE_SHIFT); - - prevbase = base; - - node_set(nodeid, node_possible_map); - } - - if (!found) - return -1; - - memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); - - for (i = 0; i < 8; i++) { - if (nodes[i].start != nodes[i].end) { - nodeid = nodeids[i]; - for (j = 0; j < num_cores; j++) - apicid_to_node[(nodeid * num_cores) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - } - - numa_init_array(); - return 0; -} diff --git a/arch/x86_64/mm/mmap_64.c b/arch/x86_64/mm/mmap_64.c deleted file mode 100644 index 80bba0dc000..00000000000 --- a/arch/x86_64/mm/mmap_64.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright 2005 Andi Kleen, SuSE Labs. - * Licensed under GPL, v.2 - */ -#include -#include -#include -#include - -/* Notebook: move the mmap code from sys_x86_64.c over here. */ - -void arch_pick_mmap_layout(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->flags & _TIF_IA32) - return ia32_pick_mmap_layout(mm); -#endif - mm->mmap_base = TASK_UNMAPPED_BASE; - if (current->flags & PF_RANDOMIZE) { - /* Add 28bit randomness which is about 40bits of address space - because mmap base has to be page aligned. - or ~1/128 of the total user VM - (total user address space is 47bits) */ - unsigned rnd = get_random_int() & 0xfffffff; - mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; - } - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} - diff --git a/arch/x86_64/mm/numa_64.c b/arch/x86_64/mm/numa_64.c deleted file mode 100644 index 6da23552226..00000000000 --- a/arch/x86_64/mm/numa_64.c +++ /dev/null @@ -1,648 +0,0 @@ -/* - * Generic VM initialization for x86-64 NUMA setups. - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifndef Dprintk -#define Dprintk(x...) -#endif - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -bootmem_data_t plat_node_bdata[MAX_NUMNODES]; - -struct memnode memnode; - -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; - -int numa_off __initdata; -unsigned long __initdata nodemap_addr; -unsigned long __initdata nodemap_size; - - -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) -{ - int i; - int res = -1; - unsigned long addr, end; - - memset(memnodemap, 0xff, memnodemapsize); - for (i = 0; i < numnodes; i++) { - addr = nodes[i].start; - end = nodes[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != 0xff) - return -1; - memnodemap[addr >> shift] = i; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long pad, pad_addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) - return 0; - - pad = L1_CACHE_BYTES - 1; - pad_addr = 0x8000; - nodemap_size = pad + memnodemapsize; - nodemap_addr = find_e820_area(pad_addr, end_pfn<= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) -{ - int shift; - - shift = extract_lsb_from_nodes(nodes, numnodes); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(nodes, numnodes, shift) != 1) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d\n", - shift); - return -1; - } - return shift; -} - -#ifdef CONFIG_SPARSEMEM -int early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} -#endif - -static void * __init -early_node_mem(int nodeid, unsigned long start, unsigned long end, - unsigned long size) -{ - unsigned long mem = find_e820_area(start, end, size); - void *ptr; - if (mem != -1L) - return __va(mem); - ptr = __alloc_bootmem_nopanic(size, - SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); - if (ptr == 0) { - printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); - return NULL; - } - return ptr; -} - -/* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; - unsigned long nodedata_phys; - void *bootmap; - const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); - - start = round_up(start, ZONE_ALIGN); - - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); - - start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; - - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); - if (node_data[nodeid] == NULL) - return; - nodedata_phys = __pa(node_data[nodeid]); - - memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; - - /* Find a place for the bootmem map */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); - bootmap = early_node_mem(nodeid, bootmap_start, end, - bootmap_pages<= end) - free_bootmem((unsigned long)node_data[nodeid],pgdat_size); - node_data[nodeid] = NULL; - return; - } - bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); - - bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); - - free_bootmem_with_active_regions(nodeid, end); - - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<node_mem_map = - __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, - memmapsize, SMP_CACHE_BYTES, - round_down(limit - memmapsize, PAGE_SIZE), - limit); -#endif -} - -void __init numa_init_array(void) -{ - int rr, i; - /* There are unfortunately some poorly designed mainboards around - that only connect memory to a single CPU. This breaks the 1:1 cpu->node - mapping. To avoid this fill in the mapping for all possible - CPUs, as the number of CPUs is not known yet. - We round robin the existing nodes. */ - rr = first_node(node_online_map); - for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] != NUMA_NO_NODE) - continue; - numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); - } - -} - -#ifdef CONFIG_NUMA_EMU -/* Numa emulation */ -char *cmdline __initdata; - -/* - * Setups up nid to range from addr to addr + size. If the end boundary is - * greater than max_addr, then max_addr is used instead. The return value is 0 - * if there is additional memory left for allocation past addr and -1 otherwise. - * addr is adjusted to be at the end of the node. - */ -static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, - u64 size, u64 max_addr) -{ - int ret = 0; - nodes[nid].start = *addr; - *addr += size; - if (*addr >= max_addr) { - *addr = max_addr; - ret = -1; - } - nodes[nid].end = *addr; - node_set(nid, node_possible_map); - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - nodes[nid].start, nodes[nid].end, - (nodes[nid].end - nodes[nid].start) >> 20); - return ret; -} - -/* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. - */ -static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, - int num_nodes) -{ - unsigned int big; - u64 size; - int i; - - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) - end = max_addr; - else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) - break; - } - return i - node_start + 1; -} - -/* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. - */ -static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, u64 size) -{ - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, nodes, addr, size, max_addr)) - ; - return i - node_start; -} - -/* - * Sets up the system RAM area from start_pfn to end_pfn according to the - * numa=fake command-line option. - */ -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) -{ - struct bootnode nodes[MAX_NUMNODES]; - u64 addr = start_pfn << PAGE_SHIFT; - u64 max_addr = end_pfn << PAGE_SHIFT; - int num_nodes = 0; - int coeff_flag; - int coeff = -1; - int num = 0; - u64 size; - int i; - - memset(&nodes, 0, sizeof(nodes)); - /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. - */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, - simple_strtol(cmdline, NULL, 0)); - if (num_nodes < 0) - return num_nodes; - goto out; - } - - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, nodes, - &addr, size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(nodes, &addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(nodes, &addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, nodes, &addr, - max_addr - addr, max_addr); - num_nodes++; - } - } -out: - memnode_shift = compute_hash_shift(nodes, num_nodes); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. NUMA emulation " - "disabled.\n"); - return -1; - } - - /* - * We need to vacate all active ranges that may have been registered by - * SRAT and set acpi_numa to -1 so that srat_disabled() always returns - * true. NUMA emulation has succeeded so we will not scan ACPI nodes. - */ - remove_all_active_ranges(); -#ifdef CONFIG_ACPI_NUMA - acpi_numa = -1; -#endif - for_each_node_mask(i, node_possible_map) { - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - acpi_fake_nodes(nodes, num_nodes); - numa_init_array(); - return 0; -} -#endif /* CONFIG_NUMA_EMU */ - -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ - int i; - - nodes_clear(node_possible_map); - -#ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; - nodes_clear(node_possible_map); -#endif - -#ifdef CONFIG_ACPI_NUMA - if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT)) - return; - nodes_clear(node_possible_map); -#endif - -#ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<nodenumber = node; - cpu_to_node[cpu] = node; -} - -unsigned long __init numa_free_all_bootmem(void) -{ - int i; - unsigned long pages = 0; - for_each_online_node(i) { - pages += free_all_bootmem_node(NODE_DATA(i)); - } - return pages; -} - -void __init paging_init(void) -{ - int i; - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - sparse_memory_present_with_active_regions(MAX_NUMNODES); - sparse_init(); - - for_each_online_node(i) { - setup_node_zones(i); - } - - free_area_init_nodes(max_zone_pfns); -} - -static __init int numa_setup(char *opt) -{ - if (!opt) - return -EINVAL; - if (!strncmp(opt,"off",3)) - numa_off = 1; -#ifdef CONFIG_NUMA_EMU - if (!strncmp(opt, "fake=", 5)) - cmdline = opt + 5; -#endif -#ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt,"noacpi",6)) - acpi_numa = -1; - if (!strncmp(opt,"hotadd=", 7)) - hotadd_percent = simple_strtoul(opt+7, NULL, 10); -#endif - return 0; -} - -early_param("numa", numa_setup); - -/* - * Setup early cpu_to_node. - * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. - */ -void __init init_cpu_to_node(void) -{ - int i; - for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid[i]; - if (apicid == BAD_APICID) - continue; - if (apicid_to_node[apicid] == NUMA_NO_NODE) - continue; - numa_set_node(i,apicid_to_node[apicid]); - } -} - -EXPORT_SYMBOL(cpu_to_node); -EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); -EXPORT_SYMBOL(node_data); - -#ifdef CONFIG_DISCONTIGMEM -/* - * Functions to convert PFNs from/to per node page addresses. - * These are out of line because they are quite big. - * They could be all tuned by pre caching more state. - * Should do that. - */ - -int pfn_valid(unsigned long pfn) -{ - unsigned nid; - if (pfn >= num_physpages) - return 0; - nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; - return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); -} -EXPORT_SYMBOL(pfn_valid); -#endif diff --git a/arch/x86_64/mm/pageattr_64.c b/arch/x86_64/mm/pageattr_64.c deleted file mode 100644 index 10b9809ce82..00000000000 --- a/arch/x86_64/mm/pageattr_64.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - /* - * page_private is used to track the number of entries in - * the page table page have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - -static void cache_flush_page(void *adr) -{ - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (adr + i)); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *l = (struct list_head *)arg; - struct page *pg; - - /* When clflush is available always use it because it is - much cheaper than WBINVD. */ - /* clflush is still broken. Disable for now. */ - if (1 || !cpu_has_clflush) - asm volatile("wbinvd" ::: "memory"); - else list_for_each_entry(pg, l, lru) { - void *adr = page_address(pg); - cache_flush_page(adr); - } - __flush_tlb_all(); -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ - -static inline void save_page(struct page *fpage) -{ - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) - list_add(&fpage->lru, &deferred_pages); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - unsigned long pfn; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; - large_pte = pfn_pte(pfn, ref_prot); - large_pte = pte_mkhuge(large_pte); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - pgprot_t ref_prot2; - - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this - * change_page_attr on the split page. - */ - struct page *split; - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); - if (!split) - return -ENOMEM; - set_pte(kpte, mk_pte(split, ref_prot2)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ - BUG_ON(PageReserved(kpte_page)); - - save_page(kpte_page); - if (page_private(kpte_page) == 0) - revert_page(address, ref_prot); - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0, kernel_map = 0; - int i; - - if (address >= __START_KERNEL_map - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { - address = (unsigned long)__va(__pa(address)); - kernel_map = 1; - } - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - if (!kernel_map || pte_present(pfn_pte(0, prot))) { - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - } - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); - /* Make sure the kernel mappings stay executable */ - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); - err = __change_page_attr(addr2, pfn, prot2, - PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct page *pg, *next; - struct list_head l; - - down_read(&init_mm.mmap_sem); - list_replace_init(&deferred_pages, &l); - up_read(&init_mm.mmap_sem); - - flush_map(&l); - - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86_64/mm/srat_64.c b/arch/x86_64/mm/srat_64.c deleted file mode 100644 index acdf03e1914..00000000000 --- a/arch/x86_64/mm/srat_64.c +++ /dev/null @@ -1,566 +0,0 @@ -/* - * ACPI 3.0 based NUMA setup - * Copyright 2004 Andi Kleen, SuSE Labs. - * - * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. - * - * Called from acpi_numa_init while reading the SRAT and SLIT tables. - * Assumes all memory regions belonging to a single proximity domain - * are in one chunk. Holes between them will be included in the node. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int acpi_numa __initdata; - -static struct acpi_table_slit *acpi_slit; - -static nodemask_t nodes_parsed __initdata; -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; -static int found_add_area __initdata; -int hotadd_percent __initdata = 0; - -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. */ -#define NODE_MIN_SIZE (4*1024*1024) - -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - -static __init int conflicting_nodes(unsigned long start, unsigned long end) -{ - int i; - for_each_node_mask(i, nodes_parsed) { - struct bootnode *nd = &nodes[i]; - if (nd->start == nd->end) - continue; - if (nd->end > start && nd->start < end) - return i; - if (nd->end == end && nd->start == start) - return i; - } - return -1; -} - -static __init void cutoff_node(int i, unsigned long start, unsigned long end) -{ - struct bootnode *nd = &nodes[i]; - - if (found_add_area) - return; - - if (nd->start < start) { - nd->start = start; - if (nd->end < nd->start) - nd->start = nd->end; - } - if (nd->end > end) { - nd->end = end; - if (nd->start > nd->end) - nd->start = nd->end; - } -} - -static __init void bad_srat(void) -{ - int i; - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - found_add_area = 0; - for (i = 0; i < MAX_LOCAL_APIC; i++) - apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) - nodes_add[i].start = nodes[i].end = 0; - remove_all_active_ranges(); -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* - * A lot of BIOS fill in 10 (= no distance) everywhere. This messes - * up the NUMA heuristics which wants the local node to have a smaller - * distance than the others. - * Do some quick checks here and only use the SLIT if it passes. - */ -static __init int slit_valid(struct acpi_table_slit *slit) -{ - int i, j; - int d = slit->locality_count; - for (i = 0; i < d; i++) { - for (j = 0; j < d; j++) { - u8 val = slit->entry[d*i + j]; - if (i == j) { - if (val != LOCAL_DISTANCE) - return 0; - } else if (val <= LOCAL_DISTANCE) - return 0; - } - } - return 1; -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - if (!slit_valid(slit)) { - printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); - return; - } - acpi_slit = slit; -} - -/* Callback for Proximity Domain -> LAPIC mapping */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ - int pxm, node; - if (srat_disabled()) - return; - if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain_lo; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - apicid_to_node[pa->apic_id] = node; - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Protect against too large hotadd areas that would fill up memory. - */ -static int hotadd_enough_memory(struct bootnode *nd) -{ - static unsigned long allocated; - static unsigned long last_area_end; - unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; - long mem = pages * sizeof(struct page); - unsigned long addr; - unsigned long allowed; - unsigned long oldpages = pages; - - if (mem < 0) - return 0; - allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; - allowed = (allowed / 100) * hotadd_percent; - if (allocated + mem > allowed) { - unsigned long range; - /* Give them at least part of their hotadd memory upto hotadd_percent - It would be better to spread the limit out - over multiple hotplug areas, but that is too complicated - right now */ - if (allocated >= allowed) - return 0; - range = allowed - allocated; - pages = (range / PAGE_SIZE); - mem = pages * sizeof(struct page); - nd->end = nd->start + range; - } - /* Not completely fool proof, but a good sanity check */ - addr = find_e820_area(last_area_end, end_pfn<> PAGE_SHIFT) > end_pfn) - end_pfn = end >> PAGE_SHIFT; - return 1; -} - -static inline int save_add_info(void) -{ - return hotadd_percent > 0; -} -#else -int update_end_of_memory(unsigned long end) {return -1;} -static int hotadd_enough_memory(struct bootnode *nd) {return 1;} -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif -#endif -/* - * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add infomation. - * This code supports one contigious hot add area per node. - */ -static int reserve_hotadd(int node, unsigned long start, unsigned long end) -{ - unsigned long s_pfn = start >> PAGE_SHIFT; - unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; - struct bootnode *nd = &nodes_add[node]; - - /* I had some trouble with strange memory hotadd regions breaking - the boot. Be very strict here and reject anything unexpected. - If you want working memory hotadd write correct SRATs. - - The node size check is a basic sanity check to guard against - mistakes */ - if ((signed long)(end - start) < NODE_MIN_SIZE) { - printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; - } - - /* This check might be a bit too strict, but I'm keeping it for now. */ - if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR - "SRAT: Hotplug area %lu -> %lu has existing memory\n", - s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; - } - - /* Looks good */ - - if (nd->start == nd->end) { - nd->start = start; - nd->end = end; - changed = 1; - } else { - if (nd->start == end) { - nd->start = start; - changed = 1; - } - if (nd->end == start) { - nd->end = end; - changed = 1; - } - if (!changed) - printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); - } - - ret = update_end_of_memory(nd->end); - - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; -} - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - struct bootnode *nd, oldnode; - unsigned long start, end; - int node, pxm; - int i; - - if (srat_disabled()) - return; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return; - } - i = conflicting_nodes(start, end); - if (i == node) { - printk(KERN_WARNING - "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, nodes[i].start, nodes[i].end); - } else if (i >= 0) { - printk(KERN_ERR - "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", - pxm, start, end, node_to_pxm(i), - nodes[i].start, nodes[i].end); - bad_srat(); - return; - } - nd = &nodes[node]; - oldnode = *nd; - if (!node_test_and_set(node, nodes_parsed)) { - nd->start = start; - nd->end = end; - } else { - if (start < nd->start) - nd->start = start; - if (nd->end < end) - nd->end = end; - } - - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - nd->start, nd->end); - e820_register_active_regions(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); - *nd = oldnode; - if ((nd->start | nd->end) == 0) - node_clear(node, nodes_parsed); - } -} - -/* Sanity check to catch more bad SRATs (they are amazingly common). - Make sure the PXMs cover all memory. */ -static int __init nodes_cover_memory(const struct bootnode *nodes) -{ - int i; - unsigned long pxmram, e820ram; - - pxmram = 0; - for_each_node_mask(i, nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; - pxmram += e - s; - pxmram -= absent_pages_in_range(s, e); - if ((long)pxmram < 0) - pxmram = 0; - } - - e820ram = end_pfn - absent_pages_in_range(0, end_pfn); - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { - printk(KERN_ERR - "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", - (pxmram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; - } - return 1; -} - -static void unparse_node(int node) -{ - int i; - node_clear(node, nodes_parsed); - for (i = 0; i < MAX_LOCAL_APIC; i++) { - if (apicid_to_node[i] == node) - apicid_to_node[i] = NUMA_NO_NODE; - } -} - -void __init acpi_numa_arch_fixup(void) {} - -/* Use the information discovered above to actually set up the nodes. */ -int __init acpi_scan_nodes(unsigned long start, unsigned long end) -{ - int i; - - if (acpi_numa <= 0) - return -1; - - /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) { - cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { - unparse_node(i); - node_set_offline(i); - } - } - - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } - - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); - bad_srat(); - return -1; - } - - node_possible_map = nodes_parsed; - - /* Finally register nodes */ - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - /* Try again in case setup_node_bootmem missed one due - to missing bootmem */ - for_each_node_mask(i, node_possible_map) - if (!node_online(i)) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] == NUMA_NO_NODE) - continue; - if (!node_isset(cpu_to_node[i], node_possible_map)) - numa_set_node(i, NUMA_NO_NODE); - } - numa_init_array(); - return 0; -} - -#ifdef CONFIG_NUMA_EMU -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for_each_node_mask(i, nodes_parsed) { - /* - * Find the real node that this emulated node appears on. For - * the sake of simplicity, we only use a real node's starting - * address to determine which emulated node it appears on. - */ - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - } - return i; -} - -/* - * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID - * mappings that respect the real ACPI topology but reflect our emulated - * environment. For each emulated node, we find which real node it appears on - * and create PXM to NID mappings for those fake nodes which mirror that - * locality. SLIT will now represent the correct distances between emulated - * nodes as a result of the real topology. - */ -void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) -{ - int i, j; - int fake_node_to_pxm_map[MAX_NUMNODES] = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL - }; - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE - }; - - printk(KERN_INFO "Faking PXM affinity for fake nodes on real " - "topology.\n"); - for (i = 0; i < num_nodes; i++) { - int nid, pxm; - - nid = find_node_by_addr(fake_nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - pxm = node_to_pxm(nid); - if (pxm == PXM_INVAL) - continue; - fake_node_to_pxm_map[i] = pxm; - /* - * For each apicid_to_node mapping that exists for this real - * node, it must now point to the fake node ID. - */ - for (j = 0; j < MAX_LOCAL_APIC; j++) - if (apicid_to_node[j] == nid) - fake_apicid_to_node[j] = i; - } - for (i = 0; i < num_nodes; i++) - __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); - - nodes_clear(nodes_parsed); - for (i = 0; i < num_nodes; i++) - if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, nodes_parsed); - WARN_ON(!nodes_cover_memory(fake_nodes)); -} - -static int null_slit_node_compare(int a, int b) -{ - return node_to_pxm(a) == node_to_pxm(b); -} -#else -static int null_slit_node_compare(int a, int b) -{ - return a == b; -} -#endif /* CONFIG_NUMA_EMU */ - -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start); - } -} - -int __node_distance(int a, int b) -{ - int index; - - if (!acpi_slit) - return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : - REMOTE_DISTANCE; - index = acpi_slit->locality_count * node_to_pxm(a); - return acpi_slit->entry[index + node_to_pxm(b)]; -} - -EXPORT_SYMBOL(__node_distance); - -int memory_add_physaddr_to_nid(u64 start) -{ - int i, ret = 0; - - for_each_node(i) - if (nodes_add[i].start <= start && nodes_add[i].end > start) - ret = i; - - return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - -- cgit v1.2.3-70-g09d2