summary | refs | log | tree | commit | diff | stats
path: root/arch/x86_64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- arch/x86_64/mm/Makefile | 2
-rw-r--r-- arch/x86_64/mm/fault.c | 54
-rw-r--r-- arch/x86_64/mm/init.c | 206
-rw-r--r-- arch/x86_64/mm/mmap.c | 30
-rw-r--r-- arch/x86_64/mm/numa.c | 75
-rw-r--r-- arch/x86_64/mm/pageattr.c | 9
-rw-r--r-- arch/x86_64/mm/srat.c | 67
7 files changed, 369 insertions, 74 deletions
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
index 1d232a87f11..d25ac86fe27 100644
--- a/arch/x86_64/mm/Makefile
+++ b/arch/x86_64/mm/Makefile
@@ -2,7 +2,7 @@
# Makefile for the linux x86_64-specific parts of the memory manager.
#
-obj-y := init.o fault.o ioremap.o extable.o pageattr.o
+obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_K8_NUMA) += k8topology.o
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 3a63707a698..26eac194064 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -35,6 +35,13 @@
#include <asm-generic/sections.h>
#include <asm/kdebug.h>
+/* Page fault error code bits */
+#define PF_PROT (1<<0) /* or no page found */
+#define PF_WRITE (1<<1)
+#define PF_USER (1<<2)
+#define PF_RSVD (1<<3)
+#define PF_INSTR (1<<4)
+
void bust_spinlocks(int yes)
{
int loglevel_save = console_loglevel;
@@ -68,7 +75,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
unsigned char *max_instr;
/* If it was a exec fault ignore */
- if (error_code & (1<<4))
+ if (error_code & PF_INSTR)
return 0;
instr = (unsigned char *)convert_rip_to_linear(current, regs);
@@ -222,17 +229,22 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
{
unsigned long flags = oops_begin();
+ struct task_struct *tsk;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
current->comm, address);
dump_pagetable(address);
+ tsk = current;
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
__die("Bad pagetable", regs, error_code);
oops_end(flags);
do_exit(SIGKILL);
}
/*
- * Handle a fault on the vmalloc or module mapping area
+ * Handle a fault on the vmalloc area
*
* This assumes no large pages in there.
*/
@@ -278,7 +290,6 @@ static int vmalloc_fault(unsigned long address)
that. */
if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
BUG();
- __flush_tlb_all();
return 0;
}
@@ -289,12 +300,6 @@ int exception_trace = 1;
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
- *
- * error_code:
- * bit 0 == 0 means no page found, 1 means protection fault
- * bit 1 == 0 means read, 1 means write
- * bit 2 == 0 means kernel, 1 means user-mode
- * bit 3 == 1 means fault was an instruction fetch
*/
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
unsigned long error_code)
@@ -337,12 +342,16 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 1) == 0.
+ * protection error (error_code & 9) == 0.
*/
if (unlikely(address >= TASK_SIZE64)) {
- if (!(error_code & 5) &&
- ((address >= VMALLOC_START && address < VMALLOC_END) ||
- (address >= MODULES_VADDR && address < MODULES_END))) {
+ /*
+ * Don't check for the module range here: its PML4
+ * is always initialized because it's shared with the main
+ * kernel text. Only vmalloc may need PML4 syncups.
+ */
+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+ ((address >= VMALLOC_START && address < VMALLOC_END))) {
if (vmalloc_fault(address) < 0)
goto bad_area_nosemaphore;
return;
@@ -354,7 +363,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
}
- if (unlikely(error_code & (1 << 3)))
+ if (unlikely(error_code & PF_RSVD))
pgtable_bad(address, regs, error_code);
/*
@@ -381,7 +390,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
* thus avoiding the deadlock.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & 4) == 0 &&
+ if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->rip))
goto bad_area_nosemaphore;
down_read(&mm->mmap_sem);
@@ -408,17 +417,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
good_area:
info.si_code = SEGV_ACCERR;
write = 0;
- switch (error_code & 3) {
+ switch (error_code & (PF_PROT|PF_WRITE)) {
default: /* 3: write, present */
/* fall through */
- case 2: /* write, not present */
+ case PF_WRITE: /* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
write++;
break;
- case 1: /* read, present */
+ case PF_PROT: /* read, present */
goto bad_area;
- case 0: /* read, not present */
+ case 0: /* read, not present */
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}
@@ -453,7 +462,7 @@ bad_area:
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
- if (error_code & 4) {
+ if (error_code & PF_USER) {
if (is_prefetch(regs, address, error_code))
return;
@@ -521,6 +530,9 @@ no_context:
printk_address(regs->rip);
printk("\n");
dump_pagetable(address);
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
__die("Oops", regs, error_code);
/* Executive summary in case the body of the oops scrolled away */
printk(KERN_EMERG "CR2: %016lx\n", address);
@@ -546,7 +558,7 @@ do_sigbus:
up_read(&mm->mmap_sem);
/* Kernel mode? Handle exceptions or die */
- if (!(error_code & 4))
+ if (!(error_code & PF_USER))
goto no_context;
tsk->thread.cr2 = address;
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index c016dfe8478..7af1742aa95 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -23,6 +23,9 @@
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -38,11 +41,16 @@
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
+#include <asm/dma-mapping.h>
+#include <asm/swiotlb.h>
#ifndef Dprintk
#define Dprintk(x...)
#endif
+struct dma_mapping_ops* dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -174,13 +182,19 @@ static struct temp_map {
{}
};
-static __init void *alloc_low_page(int *index, unsigned long *phys)
+static __meminit void *alloc_low_page(int *index, unsigned long *phys)
{
struct temp_map *ti;
int i;
unsigned long pfn = table_end++, paddr;
void *adr;
+ if (after_bootmem) {
+ adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ *phys = __pa(adr);
+ return adr;
+ }
+
if (pfn >= end_pfn)
panic("alloc_low_page: ran out of memory");
for (i = 0; temp_mappings[i].allocated; i++) {
@@ -193,55 +207,86 @@ static __init void *alloc_low_page(int *index, unsigned long *phys)
ti->allocated = 1;
__flush_tlb();
adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
+ memset(adr, 0, PAGE_SIZE);
*index = i;
*phys = pfn * PAGE_SIZE;
return adr;
}
-static __init void unmap_low_page(int i)
+static __meminit void unmap_low_page(int i)
{
- struct temp_map *ti = &temp_mappings[i];
+ struct temp_map *ti;
+
+ if (after_bootmem)
+ return;
+
+ ti = &temp_mappings[i];
set_pmd(ti->pmd, __pmd(0));
ti->allocated = 0;
}
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+/*
+ * Fill a PMD table with large-page (_PAGE_PSE) kernel mappings for the
+ * physical range [address, end), advancing PMD_SIZE per entry.
+ * Once 'address' reaches 'end' the remaining entries are cleared.
+ * Flag bits outside __supported_pte_mask are stripped from every entry.
+ */
+static void __meminit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+ unsigned long entry;
+
+ /*
+ * 'end' is exclusive, as in the PUD-level loop: with '>' a
+ * PMD-aligned 'end' would map one large page past the range.
+ */
+ if (address >= end) {
+ for (; i < PTRS_PER_PMD; i++, pmd++)
+ set_pmd(pmd, __pmd(0));
+ break;
+ }
+ entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+ entry &= __supported_pte_mask;
+ set_pmd(pmd, __pmd(entry));
+ }
+}
+
+/*
+ * Populate PMD entries below an existing PUD entry, starting at the PMD
+ * slot that covers 'address'. Does nothing if that slot is already set.
+ * NOTE(review): phys_pmd_init() always walks PTRS_PER_PMD entries from the
+ * pointer it is given, and 'pmd' here need not be the first slot of the
+ * table -- confirm it cannot run past the end of the PMD page.
+ */
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+ if (pmd_none(*pmd)) {
+ /* the table writes are done under init_mm's page_table_lock */
+ spin_lock(&init_mm.page_table_lock);
+ phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all();
+ }
+}
+
+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
- long i, j;
+ long i = pud_index(address);
- i = pud_index(address);
pud = pud + i;
+
+ if (after_bootmem && pud_val(*pud)) {
+ phys_pmd_update(pud, address, end);
+ return;
+ }
+
for (; i < PTRS_PER_PUD; pud++, i++) {
int map;
unsigned long paddr, pmd_phys;
pmd_t *pmd;
- paddr = address + i*PUD_SIZE;
- if (paddr >= end) {
- for (; i < PTRS_PER_PUD; i++, pud++)
- set_pud(pud, __pud(0));
+ paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+ if (paddr >= end)
break;
- }
- if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+ if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
set_pud(pud, __pud(0));
continue;
}
pmd = alloc_low_page(&map, &pmd_phys);
+ spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
- unsigned long pe;
-
- if (paddr >= end) {
- for (; j < PTRS_PER_PMD; j++, pmd++)
- set_pmd(pmd, __pmd(0));
- break;
- }
- pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
- pe &= __supported_pte_mask;
- set_pmd(pmd, __pmd(pe));
- }
+ phys_pmd_init(pmd, paddr, end);
+ spin_unlock(&init_mm.page_table_lock);
unmap_low_page(map);
}
__flush_tlb();
@@ -249,25 +294,32 @@ static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned lon
static void __init find_early_table_space(unsigned long end)
{
- unsigned long puds, pmds, tables;
+ unsigned long puds, pmds, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
- table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
+ /* RED-PEN putting page tables only on node 0 could
+ cause a hotspot and fill up ZONE_DMA. The page tables
+ need roughly 0.5KB per GB. */
+ start = 0x8000;
+ table_start = find_e820_area(start, end, tables);
if (table_start == -1UL)
panic("Cannot find space for the kernel page tables");
table_start >>= PAGE_SHIFT;
table_end = table_start;
+
+ early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
This runs before bootmem is initialized and gets pages directly from the
physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
unsigned long next;
@@ -279,7 +331,8 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
* mapped. Unfortunately this is done currently before the nodes are
* discovered.
*/
- find_early_table_space(end);
+ if (!after_bootmem)
+ find_early_table_space(end);
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
@@ -287,20 +340,26 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
for (; start < end; start = next) {
int map;
unsigned long pud_phys;
- pud_t *pud = alloc_low_page(&map, &pud_phys);
+ pgd_t *pgd = pgd_offset_k(start);
+ pud_t *pud;
+
+ if (after_bootmem)
+ pud = pud_offset_k(pgd, __PAGE_OFFSET);
+ else
+ pud = alloc_low_page(&map, &pud_phys);
+
next = start + PGDIR_SIZE;
if (next > end)
next = end;
phys_pud_init(pud, __pa(start), __pa(next));
- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+ if (!after_bootmem)
+ set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
unmap_low_page(map);
}
- asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+ if (!after_bootmem)
+ asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
__flush_tlb_all();
- early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
- table_start<<PAGE_SHIFT,
- table_end<<PAGE_SHIFT);
}
void __cpuinit zap_low_mappings(int cpu)
@@ -375,6 +434,9 @@ size_zones(unsigned long *z, unsigned long *h,
void __init paging_init(void)
{
unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+
+ memory_present(0, 0, end_pfn);
+ sparse_init();
size_zones(zones, holes, 0, end_pfn);
free_area_init_node(0, NODE_DATA(0), zones,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ -415,6 +477,50 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
__flush_tlb_all();
}
+/*
+ * Memory hotplug specific functions
+ * These are only for non-NUMA machines right now.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+/*
+ * Bring one page online: clear its reserved flag, free it, and bump the
+ * totalram_pages and num_physpages counters.
+ */
+void online_page(struct page *page)
+{
+ ClearPageReserved(page);
+ /* count is set to 1 first -- presumably so __free_page() drops the last reference */
+ set_page_count(page, 1);
+ __free_page(page);
+ totalram_pages++;
+ num_physpages++;
+}
+
+/*
+ * Hot-add the physical range starting at 'start' and spanning 'size' bytes:
+ * register its pages with a zone of node 0 via __add_pages(), then extend
+ * the kernel direct mapping over it.
+ * Returns 0 on success, or the non-zero result of __add_pages() on failure
+ * (in which case the direct mapping is not touched).
+ */
+int add_memory(u64 start, u64 size)
+{
+ struct pglist_data *pgdat = NODE_DATA(0);
+ /* NOTE(review): zone index MAX_NR_ZONES-2 -- presumably ZONE_NORMAL; confirm */
+ struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
+
+ ret = __add_pages(zone, start_pfn, nr_pages);
+ if (ret)
+ goto error;
+
+ /*
+ * NOTE(review): the end passed here is start + size - 1, while the
+ * loop in init_memory_mapping() treats 'end' as exclusive -- confirm
+ * the last byte/page is meant to be handled this way.
+ */
+ init_memory_mapping(start, (start + size -1));
+
+ return ret;
+error:
+ printk("%s: Problem encountered in __add_pages!\n", __func__);
+ return ret;
+}
+
+/* Memory removal is not implemented: both arguments are ignored, always returns -EINVAL. */
+int remove_memory(u64 start, u64 size)
+{
+ return -EINVAL;
+}
+
+#endif
+
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
kcore_vsyscall;
@@ -423,12 +529,9 @@ void __init mem_init(void)
long codesize, reservedpages, datasize, initsize;
#ifdef CONFIG_SWIOTLB
- if (!iommu_aperture &&
- (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
- swiotlb = 1;
- if (swiotlb)
- swiotlb_init();
+ pci_swiotlb_init();
#endif
+ no_iommu_init();
/* How many end-of-memory variables you have, grandma! */
max_low_pfn = end_pfn;
@@ -498,10 +601,33 @@ void free_initmem(void)
printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
}
+#ifdef CONFIG_DEBUG_RODATA
+
+extern char __start_rodata, __end_rodata;
+/*
+ * Write-protect the kernel's read-only data: every page from
+ * __start_rodata up to (not including) __end_rodata is switched to
+ * PAGE_KERNEL_RO, the protected size is logged, and the TLBs are flushed.
+ */
+void mark_rodata_ro(void)
+{
+ unsigned long addr = (unsigned long)&__start_rodata;
+
+ /* one page per call: the second argument of change_page_attr_addr() is 1 */
+ for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+ change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
+
+ printk ("Write protecting the kernel read-only data: %luk\n",
+ (&__end_rodata - &__start_rodata) >> 10);
+
+ /*
+ * change_page_attr_addr() requires a global_flush_tlb() call after it.
+ * We do this after the printk so that if something went wrong in the
+ * change, the printk gets out at least to give a better debug hint
+ * of who is the culprit.
+ */
+ global_flush_tlb();
+}
+#endif
+
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start < (unsigned long)&_end)
+ if (start >= end)
return;
printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
for (; start < end; start += PAGE_SIZE) {
diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c
new file mode 100644
index 00000000000..43e9b99bdf2
--- /dev/null
+++ b/arch/x86_64/mm/mmap.c
@@ -0,0 +1,30 @@
+/* Copyright 2005 Andi Kleen, SuSE Labs.
+ * Licensed under GPL, v.2
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <asm/ia32.h>
+
+/* Notebook: move the mmap code from sys_x86_64.c over here. */
+
+/*
+ * Select the mmap layout for 'mm'. Tasks flagged _TIF_IA32 are delegated to
+ * ia32_pick_mmap_layout() (only with CONFIG_IA32_EMULATION); all others get
+ * TASK_UNMAPPED_BASE as mmap base, randomized when PF_RANDOMIZE is set, and
+ * the arch_get_unmapped_area/arch_unmap_area callbacks.
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (current_thread_info()->flags & _TIF_IA32)
+ return ia32_pick_mmap_layout(mm);
+#endif
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ if (current->flags & PF_RANDOMIZE) {
+ /* Add 28 bits of randomness, which spans about 40 bits of address
+ space because the mmap base has to be page aligned,
+ i.e. ~1/128 of the total user VM
+ (total user address space is 47 bits). */
+ unsigned rnd = get_random_int() & 0xfffffff;
+ mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
+ }
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
+}
+
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 15b67d2760c..6ef9f9a7623 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -46,8 +46,8 @@ int numa_off __initdata;
* 0 if memnodmap[] too small (of shift too small)
* -1 if node overlap or lost ram (shift too big)
*/
-static int __init populate_memnodemap(
- const struct node *nodes, int numnodes, int shift)
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
{
int i;
int res = -1;
@@ -81,7 +81,7 @@ int __init compute_hash_shift(struct node *nodes, int numnodes)
while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
shift++;
- printk(KERN_DEBUG "Using %d for the hash shift.\n",
+ printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
if (populate_memnodemap(nodes, numnodes, shift) != 1) {
@@ -110,7 +110,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
start = round_up(start, ZONE_ALIGN);
- printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+ printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
@@ -156,7 +156,7 @@ void __init setup_node_zones(int nodeid)
start_pfn = node_start_pfn(nodeid);
end_pfn = node_end_pfn(nodeid);
- Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+ Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
nodeid, start_pfn, end_pfn);
size_zones(zones, holes, start_pfn, end_pfn);
@@ -200,7 +200,7 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
while ((x << 1) < sz)
x <<= 1;
if (x < sz/2)
- printk("Numa emulation unbalanced. Complain to maintainer\n");
+ printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
sz = x;
}
@@ -272,7 +272,7 @@ __cpuinit void numa_add_cpu(int cpu)
void __cpuinit numa_set_node(int cpu, int node)
{
- cpu_pda[cpu].nodenumber = node;
+ cpu_pda(cpu)->nodenumber = node;
cpu_to_node[cpu] = node;
}
@@ -330,8 +330,69 @@ __init int numa_setup(char *opt)
return 1;
}
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++) {
+ u8 apicid = x86_cpu_to_apicid[i];
+ /* no valid APIC id for this CPU slot: leave cpu_to_node[i] as it is */
+ if (apicid == BAD_APICID)
+ continue;
+ /* APIC id has no node assigned: leave cpu_to_node[i] as it is */
+ if (apicid_to_node[apicid] == NUMA_NO_NODE)
+ continue;
+ cpu_to_node[i] = apicid_to_node[apicid];
+ }
+}
+
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * Functions to convert PFNs from/to per node page addresses.
+ * These are out of line because they are quite big.
+ * They could be all tuned by pre caching more state.
+ * Should do that.
+ */
+
+/* Requires pfn_valid(pfn) to be true */
+struct page *pfn_to_page(unsigned long pfn)
+{
+ /* look up the node from the physical address, then index that node's mem_map */
+ int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
+ return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
+}
+EXPORT_SYMBOL(pfn_to_page);
+
+/*
+ * Convert a struct page pointer to its page frame number: the page's index
+ * within its zone's zone_mem_map plus the zone's zone_start_pfn.
+ */
+unsigned long page_to_pfn(struct page *page)
+{
+ return (long)(((page) - page_zone(page)->zone_mem_map) +
+ page_zone(page)->zone_start_pfn);
+}
+EXPORT_SYMBOL(page_to_pfn);
+
+/*
+ * Return non-zero if 'pfn' is below num_physpages and falls inside the
+ * [node_start_pfn, node_end_pfn) range of the node it maps to, 0 otherwise.
+ */
+int pfn_valid(unsigned long pfn)
+{
+ unsigned nid;
+ if (pfn >= num_physpages)
+ return 0;
+ nid = pfn_to_nid(pfn);
+ /* NOTE(review): 0xff is presumably the "no node" marker from pfn_to_nid(); confirm */
+ if (nid == 0xff)
+ return 0;
+ return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
+}
+EXPORT_SYMBOL(pfn_valid);
+#endif
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index b90e8fe9eeb..35f1f1aab06 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -128,6 +128,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
pte_t *kpte;
struct page *kpte_page;
unsigned kpte_flags;
+ pgprot_t ref_prot2;
kpte = lookup_address(address);
if (!kpte) return 0;
kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -140,10 +141,14 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
* split_large_page will take the reference for this change_page_attr
* on the split page.
*/
- struct page *split = split_large_page(address, prot, ref_prot);
+
+ struct page *split;
+ ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
+
+ split = split_large_page(address, prot, ref_prot2);
if (!split)
return -ENOMEM;
- set_pte(kpte,mk_pte(split, ref_prot));
+ set_pte(kpte,mk_pte(split, ref_prot2));
kpte_page = split;
}
get_page(kpte_page);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 33340bd1e32..8b7f85608fa 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -17,21 +17,23 @@
#include <linux/topology.h>
#include <asm/proto.h>
#include <asm/numa.h>
+#include <asm/e820.h>
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct node nodes[MAX_NUMNODES] __initdata;
-static __u8 pxm2node[256] = { [0 ... 255] = 0xff };
+static u8 pxm2node[256] = { [0 ... 255] = 0xff };
static int node_to_pxm(int n);
int pxm_to_node(int pxm)
{
if ((unsigned)pxm >= 256)
- return 0;
- return pxm2node[pxm];
+ return -1;
+ /* Extend 0xff to (int)-1 */
+ return (signed char)pxm2node[pxm];
}
static __init int setup_node(int pxm)
@@ -91,9 +93,36 @@ static __init inline int srat_disabled(void)
return numa_off || acpi_numa < 0;
}
+/*
+ * Many BIOSes fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics, which want the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ */
+/* Returns 1 if the SLIT passes the checks below, 0 otherwise. */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+ int i, j;
+ int d = slit->localities;
+ /* entry[] is read as a d x d matrix stored row by row */
+ for (i = 0; i < d; i++) {
+ for (j = 0; j < d; j++) {
+ u8 val = slit->entry[d*i + j];
+ if (i == j) {
+ /* diagonal (node to itself) must be exactly 10 */
+ if (val != 10)
+ return 0;
+ /* every other pair must be strictly greater than 10 */
+ } else if (val <= 10)
+ return 0;
+ }
+ }
+ return 1;
+}
+
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
+ if (!slit_valid(slit)) {
+ printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+ return;
+ }
acpi_slit = slit;
}
@@ -168,12 +197,39 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
nd->start, nd->end);
}
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+ Make sure the PXMs cover all memory. */
+/*
+ * Compare the RAM described by the parsed SRAT nodes with the RAM reported
+ * by e820 (both in pages, with e820 holes subtracted).
+ * Returns 1 if the nodes cover at least as much RAM as e820 reports,
+ * 0 (after logging an error) if they cover less.
+ *
+ * Marked __init: it reads nodes_parsed and nodes[], which are __initdata,
+ * and is called from the __init function acpi_scan_nodes().
+ */
+static __init int nodes_cover_memory(void)
+{
+ int i;
+ unsigned long pxmram, e820ram;
+
+ pxmram = 0;
+ for_each_node_mask(i, nodes_parsed) {
+ unsigned long s = nodes[i].start >> PAGE_SHIFT;
+ unsigned long e = nodes[i].end >> PAGE_SHIFT;
+ /* pages spanned by the node, minus the e820 holes inside that span */
+ pxmram += e - s;
+ pxmram -= e820_hole_size(s, e);
+ }
+
+ e820ram = end_pfn - e820_hole_size(0, end_pfn);
+ if (pxmram < e820ram) {
+ printk(KERN_ERR
+ "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
+ (pxmram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return 0;
+ }
+ return 1;
+}
+
void __init acpi_numa_arch_fixup(void) {}
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
int i;
+
if (acpi_numa <= 0)
return -1;
@@ -184,6 +240,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
node_clear(i, nodes_parsed);
}
+ if (!nodes_cover_memory()) {
+ bad_srat();
+ return -1;
+ }
+
memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
if (memnode_shift < 0) {
printk(KERN_ERR