From e65845045588806fa5c8df8a4f4253516515a5e3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: dma_ops as const The dma_ops structure can be const since it never changes after boot. Signed-off-by: Stephen Hemminger Signed-off-by: Andi Kleen --- arch/x86_64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index ec31534eb10..5ca61731f55 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -46,7 +46,7 @@ #define Dprintk(x...) #endif -struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops* dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; -- cgit v1.2.3-70-g09d2 From dafe41ee3a9389c08c91cdfd8670295f20f89e04 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: Kill temp boot pmds Early in the boot process we need the ability to set up temporary mappings, before our normal mechanisms are initialized. Currently this is used to map pages that are part of the page tables we are building and pages during the dmi scan. The core problem is that we are using the user portion of the page tables to implement this. Which means that while this mechanism is active we cannot catch NULL pointer dereferences and we deviate from the normal ways of handling things. In this patch I modify early_ioremap to map pages into the kernel portion of address space, roughly where we will later put modules, and I make the discovery of which addresses we can use dynamic which removes all kinds of static limits and remove the dependencies on implementation details between different parts of the code. Now alloc_low_page() and unmap_low_page() use early_iomap() and early_iounmap() to allocate/map and unmap a page. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 3 -- arch/x86_64/mm/init.c | 100 +++++++++++++++++++++------------------------- 2 files changed, 45 insertions(+), 58 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 598a4d0351f..118c6088198 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -288,9 +288,6 @@ NEXT_PAGE(level2_ident_pgt) .quad i << 21 | 0x083 i = i + 1 .endr - /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ - .globl temp_boot_pmds -temp_boot_pmds: .fill 492,8,0 NEXT_PAGE(level2_kernel_pgt) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 5ca61731f55..4ab3d40aac9 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -167,23 +167,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) unsigned long __initdata table_start, table_end; -extern pmd_t temp_boot_pmds[]; - -static struct temp_map { - pmd_t *pmd; - void *address; - int allocated; -} temp_mappings[] __initdata = { - { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, - { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, - {} -}; - -static __meminit void *alloc_low_page(int *index, unsigned long *phys) +static __meminit void *alloc_low_page(unsigned long *phys) { - struct temp_map *ti; - int i; - unsigned long pfn = table_end++, paddr; + unsigned long pfn = table_end++; void *adr; if (after_bootmem) { @@ -194,57 +180,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys) if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); - for (i = 0; temp_mappings[i].allocated; i++) { - if (!temp_mappings[i].pmd) - panic("alloc_low_page: ran out of temp mappings"); - } - ti = &temp_mappings[i]; - paddr = (pfn << PAGE_SHIFT) & PMD_MASK; - set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); - ti->allocated = 1; - __flush_tlb(); - adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + + adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *index = i; - *phys = pfn * PAGE_SIZE; - return adr; -} + *phys = pfn * PAGE_SIZE; + return adr; +} -static __meminit void unmap_low_page(int i) +static __meminit void unmap_low_page(void *adr) { - struct temp_map *ti; if (after_bootmem) return; - ti = &temp_mappings[i]; - set_pmd(ti->pmd, __pmd(0)); - ti->allocated = 0; + early_iounmap(adr, PAGE_SIZE); } /* Must run before zap_low_mappings */ __init void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long map = round_down(addr, LARGE_PAGE_SIZE); - - /* actually usually some more */ - if (size >= LARGE_PAGE_SIZE) { - return NULL; + unsigned long vaddr; + pmd_t *pmd, *last_pmd; + int i, pmds; + + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + vaddr = __START_KERNEL_map; + pmd = level2_kernel_pgt; + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { + for (i = 0; i < pmds; i++) { + if (pmd_present(pmd[i])) + goto next; + } + vaddr += addr & ~PMD_MASK; + addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return (void *)vaddr; + next: + ; } - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - map += LARGE_PAGE_SIZE; - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } /* To avoid virtual aliases later */ __init void early_iounmap(void *addr, unsigned long size) { - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) - printk("early_iounmap: bad address %p\n", addr); - set_pmd(temp_mappings[0].pmd, __pmd(0)); - set_pmd(temp_mappings[1].pmd, __pmd(0)); + unsigned long vaddr; + pmd_t *pmd; + int i, pmds; + + vaddr = (unsigned long)addr; + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) + pmd_clear(pmd + i); __flush_tlb(); } @@ -289,7 +281,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { - int map; unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -307,12 +298,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne continue; } - pmd = alloc_low_page(&map, &pmd_phys); + pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); - unmap_low_page(map); + unmap_low_page(pmd); } __flush_tlb(); } @@ -364,7 +355,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - int map; unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); pud_t *pud; @@ -372,7 +362,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) if (after_bootmem) pud = pud_offset(pgd, start & PGDIR_MASK); else - pud = alloc_low_page(&map, &pud_phys); + pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; if (next > end) @@ -380,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(map); + unmap_low_page(pud); } if (!after_bootmem) -- cgit v1.2.3-70-g09d2 From cfd243d4af7c7f8f52f5cb99d3932d9074b039ff Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: Remove the identity mapping as early as possible With the rewrite of the SMP trampoline and the early page allocator there is nothing that needs identity mapped pages, once we start executing C code. So add zap_identity_mappings into head64.c and remove zap_low_mappings() from much later in the code. The functions are subtly different thus the name change. This also kills boot_level4_pgt which was from an earlier attempt to move the identity mappings as early as possible, and is now no longer needed. Essentially I have replaced boot_level4_pgt with trampoline_level4_pgt in trampoline.S Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 39 ++++++++++++++------------------------- arch/x86_64/kernel/head64.c | 17 +++++++++++------ arch/x86_64/kernel/setup.c | 2 -- arch/x86_64/kernel/setup64.c | 1 - arch/x86_64/mm/init.c | 24 ------------------------ include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/proto.h | 2 -- 7 files changed, 25 insertions(+), 61 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 926aa2197aa..c211e52f133 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -71,7 +71,7 @@ startup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(boot_level4_pgt - __START_KERNEL_map), %eax + movl $(init_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 /* Setup EFER (Extended Feature Enable Register) */ @@ -115,7 +115,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(boot_level4_pgt - __START_KERNEL_map), %rax + movq $(init_level4_pgt - __START_KERNEL_map), %rax movq %rax, %cr3 /* Check if nx is implemented */ @@ -274,9 +274,19 @@ ENTRY(name) i = i + 1 ; \ .endr + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ NEXT_PAGE(init_level4_pgt) - /* This gets initialized in x86_64_start_kernel */ - .fill 512,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -307,27 +317,6 @@ NEXT_PAGE(level2_kernel_pgt) #undef NEXT_PAGE .data - -#ifndef CONFIG_HOTPLUG_CPU - __INITDATA -#endif - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ - .align PAGE_SIZE -ENTRY(boot_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 257,8,0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 252,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE - - .data - .align 16 .globl cpu_gdt_descr cpu_gdt_descr: diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 5c529c1e3d6..6c34bdd22e2 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -18,8 +18,16 @@ #include #include #include +#include #include +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb(); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -57,18 +65,15 @@ void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + /* Make NULL pointers segfault */ + zap_identity_mappings(); + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); early_printk("Kernel alive\n"); - /* - * switch to init_level4_pgt from boot_level4_pgt - */ - memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - for (i = 0; i < NR_CPUS; i++) cpu_pda(i) = &boot_cpu_pda[i]; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 65e2bc551a2..0e2b8df0ea6 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -274,8 +274,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - zap_low_mappings(0); - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 6a70b55f719..53064a9a365 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -201,7 +201,6 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); - zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 4ab3d40aac9..b0a60789218 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -378,21 +378,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) __flush_tlb_all(); } -void __cpuinit zap_low_mappings(int cpu) -{ - if (cpu == 0) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - } else { - /* - * For AP's, zap the low identity mappings by changing the cr3 - * to init_level4_pgt and doing local flush tlb all - */ - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - } - __flush_tlb_all(); -} - #ifndef CONFIG_NUMA void __init paging_init(void) { @@ -569,15 +554,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - -#ifdef CONFIG_SMP - /* - * Sync boot_level4_pgt mappings with the init_level4_pgt - * except for the low identity mappings which are already zapped - * in init_level4_pgt. This sync-up is essential for AP's bringup - */ - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); -#endif } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 5a5d43b3ef5..703f0243f27 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -17,7 +17,6 @@ extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; -extern pgd_t boot_level4_pgt[]; extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 78427021d94..3f8f285138d 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -11,8 +11,6 @@ struct pt_regs; extern void start_kernel(void); extern void pda_init(int); -extern void zap_low_mappings(int cpu); - extern void early_idt_handler(void); extern void mcheck_init(struct cpuinfo_x86 *c); -- cgit v1.2.3-70-g09d2 From 0dbf7028c0c1f266c9631139450a1502d3cd457e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86: __pa and __pa_symbol address space separation Currently __pa_symbol is for use with symbols in the kernel address map and __pa is for use with pointers into the physical memory map. But the code is implemented so you can usually interchange the two. __pa which is much more common can be implemented much more cheaply if it is it doesn't have to worry about any other kernel address spaces. This is especially true with a relocatable kernel as __pa_symbol needs to peform an extra variable read to resolve the address. There is a third macro that is added for the vsyscall data __pa_vsymbol for finding the physical addesses of vsyscall pages. Most of this patch is simply sorting through the references to __pa or __pa_symbol and using the proper one. A little of it is continuing to use a physical address when we have it instead of recalculating it several times. swapper_pgd is now NULL. leave_mm now uses init_mm.pgd and init_mm.pgd is initialized at boot (instead of compile time) to the physmem virtual mapping of init_level4_pgd. The physical address changed. Except for the for EMPTY_ZERO page all of the remaining references to __pa_symbol appear to be during kernel initialization. So this should reduce the cost of __pa in the common case, even on a relocated kernel. As this is technically a semantic change we need to be on the lookout for anything I missed. But it works for me (tm). Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/alternative.c | 4 ++-- arch/i386/mm/init.c | 15 ++++++++------- arch/x86_64/kernel/machine_kexec.c | 14 +++++++------- arch/x86_64/kernel/setup.c | 9 +++++---- arch/x86_64/kernel/smp.c | 2 +- arch/x86_64/kernel/vsyscall.c | 9 +++++++-- arch/x86_64/mm/init.c | 21 +++++++++++---------- arch/x86_64/mm/pageattr.c | 16 ++++++++-------- include/asm-x86_64/page.h | 6 ++---- include/asm-x86_64/pgtable.h | 4 ++-- 10 files changed, 53 insertions(+), 47 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 426f59b0106..a27c8d34736 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -402,8 +402,8 @@ void __init alternative_instructions(void) _text, _etext); } free_init_pages("SMP alternatives", - (unsigned long)__smp_alt_begin, - (unsigned long)__smp_alt_end); + __pa_symbol(&__smp_alt_begin), + __pa_symbol(&__smp_alt_end)); } else { alternatives_smp_save(__smp_alt_instructions, __smp_alt_instructions_end); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index ae436882af7..23be1b0aafa 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -774,10 +774,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); @@ -786,14 +787,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 0497e3bd5bf..a8bb33c1a8f 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; - page_list[PA_PGD] = __pa(kexec_pgd); + page_list[PA_PGD] = __pa_symbol(&kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; - page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0); page_list[VA_PUD_0] = (unsigned long)kexec_pud0; - page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0); page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0); page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1); page_list[VA_PUD_1] = (unsigned long)kexec_pud1; - page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1); page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; - page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; page_list[PA_TABLE_PAGE] = diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0e2b8df0ea6..b9bdfc1b54f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -243,11 +243,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; + init_mm.pgd = __va(__pa_symbol(&init_level4_pgt)); - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; + code_resource.start = __pa_symbol(&_text); + code_resource.end = __pa_symbol(&_etext)-1; + data_resource.start = __pa_symbol(&_etext); + data_resource.end = __pa_symbol(&_edata)-1; early_identify_cpu(&boot_cpu_data); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index bd1d123947c..22abae4e9f3 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -76,7 +76,7 @@ static inline void leave_mm(int cpu) if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); + load_cr3(init_mm.pgd); } /* diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b43c698cf7d..d14cbb3e0eb 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -45,6 +45,11 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" +#define __pa_vsymbol(x) \ + ({unsigned long v; \ + extern char __vsyscall_0; \ + asm("" : "=r" (v) : "0" (x)); \ + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) struct vsyscall_gtod_data_t { seqlock_t lock; @@ -224,10 +229,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, return ret; /* gcc has some trouble with __va(__pa()), so just do it this way. */ - map1 = ioremap(__pa_symbol(&vsysc1), 2); + map1 = ioremap(__pa_vsymbol(&vsysc1), 2); if (!map1) return -ENOMEM; - map2 = ioremap(__pa_symbol(&vsysc2), 2); + map2 = ioremap(__pa_vsymbol(&vsysc2), 2); if (!map2) { ret = -ENOMEM; goto out; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index b0a60789218..69e22d3c923 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -565,11 +565,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } } @@ -579,17 +579,18 @@ void free_initmem(void) memset(__initdata_begin, POISON_FREE_INITDATA, __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata)); + unsigned long end = (unsigned long)__va(__pa_symbol(&__end_rodata)); - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) + for (; addr < end; addr += PAGE_SIZE) change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); printk ("Write protecting the kernel read-only data: %luk\n", @@ -608,7 +609,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 081409aa345..76ee90a5abe 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, SetPagePrivate(base); page_private(base) = 0; - address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { @@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage) * No more special protections in this 2/4MB area - revert to a * large page again. */ -static void revert_page(unsigned long address, pgprot_t ref_prot) +static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t large_pte; - unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); @@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ struct page *split; ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); + split = split_large_page(pfn << PAGE_SHIFT, prot, + ref_prot2); if (!split) return -ENOMEM; set_pte(kpte, mk_pte(split, ref_prot2)); @@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, if (page_private(kpte_page) == 0) { save_page(kpte_page); - revert_page(address, ref_prot); + revert_page(address, pfn, ref_prot); } return 0; } @@ -180,6 +178,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { + unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; int err = 0; int i; @@ -192,10 +191,11 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) break; /* Handle kernel mapping too which aliases part of the * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { + if ((pfn >= phys_base_pfn) && + ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) { unsigned long addr2; pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); + addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT); /* Make sure the kernel mappings stay executable */ prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); err = __change_page_attr(addr2, pfn, prot2, diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index d554b94485d..4974433bbf3 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -102,17 +102,15 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. Otherwise you risk miscompilation. */ -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - __pa(v); }) + (v - __START_KERNEL_map); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define __boot_va(x) __va(x) -#define __boot_pa(x) __pa(x) #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) ((pfn) < end_pfn) #endif diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 703f0243f27..c1865e38c7b 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; extern unsigned long __supported_pte_mask; -#define swapper_pg_dir init_level4_pgt +#define swapper_pg_dir ((pgd_t *)NULL) extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); @@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size); * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT)) #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3-70-g09d2 From 8b8ca80e192b10eecc01fc44a2902510af86f73b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 8 +- arch/x86_64/mm/numa.c | 261 ++++++++++++++++++++-------------- include/asm-x86_64/mmzone.h | 2 +- 3 files changed, 161 insertions(+), 110 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 85f51e5a749..7500aad95f3 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -149,7 +149,13 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup - numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. + numa=fake=CMDLINE + If a number, fakes CMDLINE nodes and ignores NUMA setup of the + actual machine. Otherwise, system memory is configured + depending on the sizes and coefficients listed. For example: + numa=fake=2*512,1024,4*256 + gives two 512M nodes, a 1024M node, and four 256M nodes. The + remaining system RAM is allocated to an additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 41b8fb06992..c55936bc6be 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -273,125 +273,172 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -int numa_fake __initdata = 0; +#define E820_ADDR_HOLE_SIZE(start, end) \ + (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ + PAGE_SHIFT) +char *cmdline __initdata; /* - * This function is used to find out if the start and end correspond to - * different zones. + * Setups up nid to range from addr to addr + size. If the end boundary is + * greater than max_addr, then max_addr is used instead. The return value is 0 + * if there is additional memory left for allocation past addr and -1 otherwise. + * addr is adjusted to be at the end of the node. */ -int zone_cross_over(unsigned long start, unsigned long end) +static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, + u64 size, u64 max_addr) { - if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && - (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) - return 1; - return 0; + int ret = 0; + nodes[nid].start = *addr; + *addr += size; + if (*addr >= max_addr) { + *addr = max_addr; + ret = -1; + } + nodes[nid].end = *addr; + node_set_online(nid); + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + nodes[nid].start, nodes[nid].end, + (nodes[nid].end - nodes[nid].start) >> 20); + return ret; } -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +/* + * Splits num_nodes nodes up equally starting at node_start. The return value + * is the number of nodes split up and addr is adjusted to be at the end of the + * last node allocated. + */ +static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, + int num_nodes) { - int i, big; - struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz, old_sz; - unsigned long hole_size; - unsigned long start, end; - unsigned long max_addr = (end_pfn << PAGE_SHIFT); - - start = (start_pfn << PAGE_SHIFT); - hole_size = e820_hole_size(start, max_addr); - sz = (max_addr - start - hole_size) / numa_fake; - - /* Kludge needed for the hash function */ - - old_sz = sz; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - */ - sz &= FAKE_NODE_MIN_HASH_MASK; + unsigned int big; + u64 size; + int i; + if (num_nodes <= 0) + return -1; + if (num_nodes > MAX_NUMNODES) + num_nodes = MAX_NUMNODES; + size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / + num_nodes; /* - * We ensure that each node is at least 64MB big. Smaller than this - * size can cause VM hiccups. + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the leftovers. */ - if (sz == 0) { - printk(KERN_INFO "Not enough memory for %d nodes. Reducing " - "the number of nodes\n", numa_fake); - numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; - printk(KERN_INFO "Number of fake nodes will be = %d\n", - numa_fake); - sz = FAKE_NODE_MIN_SIZE; + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / + FAKE_NODE_MIN_SIZE; + + /* Round down to nearest FAKE_NODE_MIN_SIZE. */ + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + printk(KERN_ERR "Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; } - /* - * Find out how many nodes can get an extra NODE_MIN_SIZE granule. - * This logic ensures the extra memory gets distributed among as many - * nodes as possible (as compared to one single node getting all that - * extra memory. - */ - big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; - printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " - "%d\n", - (sz >> 20), (hole_size >> 20), big); - memset(&nodes,0,sizeof(nodes)); - end = start; - for (i = 0; i < numa_fake; i++) { - /* - * In case we are not able to allocate enough memory for all - * the nodes, we reduce the number of fake nodes. - */ - if (end >= max_addr) { - numa_fake = i - 1; - break; - } - start = nodes[i].start = end; - /* - * Final node can have all the remaining memory. - */ - if (i == numa_fake-1) - sz = max_addr - start; - end = nodes[i].start + sz; - /* - * Fir "big" number of nodes get extra granule. - */ + + for (i = node_start; i < num_nodes + node_start; i++) { + u64 end = *addr + size; if (i < big) end += FAKE_NODE_MIN_SIZE; /* - * Iterate over the range to ensure that this node gets at - * least sz amount of RAM (excluding holes) + * The final node can have the remaining system RAM. Other + * nodes receive roughly the same amount of available pages. */ - while ((end - start - e820_hole_size(start, end)) < sz) { - end += FAKE_NODE_MIN_SIZE; - if (end >= max_addr) - break; + if (i == num_nodes + node_start - 1) + end = max_addr; + else + while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < + size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + break; + } + return i - node_start + 1; +} + +/* + * Sets up the system RAM area from start_pfn to end_pfn according to the + * numa=fake command-line option. + */ +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + struct bootnode nodes[MAX_NUMNODES]; + u64 addr = start_pfn << PAGE_SHIFT; + u64 max_addr = end_pfn << PAGE_SHIFT; + unsigned int coeff; + unsigned int num = 0; + int num_nodes = 0; + u64 size; + int i; + + memset(&nodes, 0, sizeof(nodes)); + /* + * If the numa=fake command-line is just a single number N, split the + * system RAM into N fake nodes. + */ + if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, + simple_strtol(cmdline, NULL, 0)); + if (num_nodes < 0) + return num_nodes; + goto out; + } + + /* Parse the command line. */ + for (coeff = 1; ; cmdline++) { + if (*cmdline && isdigit(*cmdline)) { + num = num * 10 + *cmdline - '0'; + continue; } - /* - * Look at the next node to make sure there is some real memory - * to map. Bad things happen when the only memory present - * in a zone on a fake node is IO hole. - */ - while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { - if (zone_cross_over(start, end + sz)) { - end = (MAX_DMA32_PFN << PAGE_SHIFT); - break; + if (*cmdline == '*') + coeff = num; + if (!*cmdline || *cmdline == ',') { + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + * Command-line coefficients are in megabytes. + */ + size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; + if (size) { + for (i = 0; i < coeff; i++, num_nodes++) + if (setup_node_range(num_nodes, nodes, + &addr, size, max_addr) < 0) + goto done; + coeff = 1; } - if (end >= max_addr) - break; - end += FAKE_NODE_MIN_SIZE; } - if (end > max_addr) - end = max_addr; - nodes[i].end = end; - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", - i, - nodes[i].start, nodes[i].end, - (nodes[i].end - nodes[i].start) >> 20); - node_set_online(i); - } - memnode_shift = compute_hash_shift(nodes, numa_fake); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); - return -1; - } - for_each_online_node(i) { + if (!*cmdline) + break; + num = 0; + } +done: + if (!num_nodes) + return -1; + /* Fill remainder of system RAM with a final node, if appropriate. */ + if (addr < max_addr) { + setup_node_range(num_nodes, nodes, &addr, max_addr - addr, + max_addr); + num_nodes++; + } +out: + memnode_shift = compute_hash_shift(nodes, num_nodes); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. NUMA emulation " + "disabled.\n"); + return -1; + } + + /* + * We need to vacate all active ranges that may have been registered by + * SRAT. + */ + remove_all_active_ranges(); + for_each_online_node(i) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) numa_init_array(); return 0; } -#endif +#undef E820_ADDR_HOLE_SIZE +#endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; #ifdef CONFIG_NUMA_EMU - if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, end_pfn)) return; #endif @@ -486,11 +534,8 @@ static __init int numa_setup(char *opt) if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU - if(!strncmp(opt, "fake=", 5)) { - numa_fake = simple_strtoul(opt+5,NULL,0); ; - if (numa_fake >= MAX_NUMNODES) - numa_fake = MAX_NUMNODES; - } + if (!strncmp(opt, "fake=", 5)) + cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index fb558fb1d21..19a89377b12 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE (64*1024*1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL)) #endif #endif -- cgit v1.2.3-70-g09d2 From 14694d736bb66d0ec250d05c81c6e98a19c229c6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: split remaining fake nodes equally Extends the numa=fake x86_64 command-line option to split the remaining system memory into equal-sized nodes. For example: numa=fake=2*512,4* gives two 512M nodes and the remaining system memory is split into four approximately equal chunks. This is beneficial for systems where the exact size of RAM is unknown or not necessarily relevant, but the granularity with which nodes shall be allocated is known. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 4 +++- arch/x86_64/mm/numa.c | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 7500aad95f3..12a9aacecaa 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -155,7 +155,9 @@ NUMA depending on the sizes and coefficients listed. For example: numa=fake=2*512,1024,4*256 gives two 512M nodes, a 1024M node, and four 256M nodes. The - remaining system RAM is allocated to an additional node. + remaining system RAM is allocated to an additional node. If + the last character of CMDLINE is a *, the remaining system RAM + is instead divided up equally among its coefficient. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index c55936bc6be..0ae2d9d5d7e 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -418,11 +418,25 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) done: if (!num_nodes) return -1; - /* Fill remainder of system RAM with a final node, if appropriate. */ + /* Fill remainder of system RAM, if appropriate. */ if (addr < max_addr) { - setup_node_range(num_nodes, nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; + switch (*(cmdline - 1)) { + case '*': + /* Split remaining nodes into coeff chunks */ + if (coeff <= 0) + break; + num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes, coeff); + break; + case ',': + /* Do not allocate remaining system RAM */ + break; + default: + /* Give one final node */ + setup_node_range(num_nodes, nodes, &addr, + max_addr - addr, max_addr); + num_nodes++; + } } out: memnode_shift = compute_hash_shift(nodes, num_nodes); -- cgit v1.2.3-70-g09d2 From 382591d500bbcd20a44416c5e0e292708468587c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: fixed size remaining fake nodes Extends the numa=fake x86_64 command-line option to split the remaining system memory into nodes of fixed size. Any leftover memory is allocated to a final node unless the command-line ends with a comma. For example: numa=fake=2*512,*128 gives two 512M nodes and the remaining system memory is split into nodes of 128M each. This is beneficial for systems where the exact size of RAM is unknown or not necessarily relevant, but the size of the remaining nodes to be allocated is known based on their capacity for resource management. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 14 +++++++---- arch/x86_64/mm/numa.c | 47 +++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 15 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 12a9aacecaa..6177d881983 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -153,11 +153,15 @@ NUMA If a number, fakes CMDLINE nodes and ignores NUMA setup of the actual machine. Otherwise, system memory is configured depending on the sizes and coefficients listed. For example: - numa=fake=2*512,1024,4*256 - gives two 512M nodes, a 1024M node, and four 256M nodes. The - remaining system RAM is allocated to an additional node. If - the last character of CMDLINE is a *, the remaining system RAM - is instead divided up equally among its coefficient. + numa=fake=2*512,1024,4*256,*128 + gives two 512M nodes, a 1024M node, four 256M nodes, and the + rest split into 128M chunks. If the last character of CMDLINE + is a *, the remaining memory is divided up equally among its + coefficient: + numa=fake=2*512,2* + gives two 512M nodes and the rest split into two nodes. + Otherwise, the remaining system RAM is allocated to an + additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 0ae2d9d5d7e..5ee07bc41eb 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -361,6 +361,21 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, return i - node_start + 1; } +/* + * Splits the remaining system RAM into chunks of size. The remaining memory is + * always assigned to a final node and can be asymmetric. Returns the number of + * nodes split. + */ +static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, u64 size) +{ + int i = node_start; + size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; + while (!setup_node_range(i++, nodes, addr, size, max_addr)) + ; + return i - node_start; +} + /* * Sets up the system RAM area from start_pfn to end_pfn according to the * numa=fake command-line option. @@ -370,9 +385,10 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) struct bootnode nodes[MAX_NUMNODES]; u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - unsigned int coeff; - unsigned int num = 0; int num_nodes = 0; + int coeff_flag; + int coeff = -1; + int num = 0; u64 size; int i; @@ -390,29 +406,34 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) } /* Parse the command line. */ - for (coeff = 1; ; cmdline++) { + for (coeff_flag = 0; ; cmdline++) { if (*cmdline && isdigit(*cmdline)) { num = num * 10 + *cmdline - '0'; continue; } - if (*cmdline == '*') - coeff = num; + if (*cmdline == '*') { + if (num > 0) + coeff = num; + coeff_flag = 1; + } if (!*cmdline || *cmdline == ',') { + if (!coeff_flag) + coeff = 1; /* * Round down to the nearest FAKE_NODE_MIN_SIZE. * Command-line coefficients are in megabytes. */ size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) { + if (size) for (i = 0; i < coeff; i++, num_nodes++) if (setup_node_range(num_nodes, nodes, &addr, size, max_addr) < 0) goto done; - coeff = 1; - } + if (!*cmdline) + break; + coeff_flag = 0; + coeff = -1; } - if (!*cmdline) - break; num = 0; } done: @@ -420,6 +441,12 @@ done: return -1; /* Fill remainder of system RAM, if appropriate. */ if (addr < max_addr) { + if (coeff_flag && coeff < 0) { + /* Split remaining nodes into num-sized chunks */ + num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes, num); + goto out; + } switch (*(cmdline - 1)) { case '*': /* Split remaining nodes into coeff chunks */ -- cgit v1.2.3-70-g09d2 From d01ad8dd56527be72947b4b9997bb2c05783c3ed Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86: Improve handling of kernel mappings in change_page_attr Fix various broken corner cases in i386 and x86-64 change_page_attr. AK: split off from tighten kernel image access rights Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/pageattr.c | 4 ++-- arch/x86_64/mm/pageattr.c | 16 ++++++++++++---- include/asm-i386/pgtable.h | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 412ebbd8adb..ea6b6d4a0a2 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot) return -EINVAL; kpte_page = virt_to_page(kpte); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); } else { pgprot_t ref_prot; @@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot) kpte_page = split; } page_private(kpte_page)++; - } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 76ee90a5abe..bf4aa8dd425 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -179,16 +179,24 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; - int err = 0; + int err = 0, kernel_map = 0; int i; + if (address >= __START_KERNEL_map + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { + address = (unsigned long)__va(__pa(address)); + kernel_map = 1; + } + down_write(&init_mm.mmap_sem); for (i = 0; i < numpages; i++, address += PAGE_SIZE) { unsigned long pfn = __pa(address) >> PAGE_SHIFT; - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; + if (!kernel_map || pte_present(pfn_pte(0, prot))) { + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + } /* Handle kernel mapping too which aliases part of the * lowmem */ if ((pfn >= phys_base_pfn) && diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index c3b58d473a5..143ddc42b86 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -159,6 +159,7 @@ void paging_init(void); extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -166,6 +167,7 @@ extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -- cgit v1.2.3-70-g09d2 From 6fb14755a676282a4e6caa05a08c92db8e45cfff Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86: tighten kernel image page access rights On x86-64, kernel memory freed after init can be entirely unmapped instead of just getting 'poisoned' by overwriting with a debug pattern. On i386 and x86-64 (under CONFIG_DEBUG_RODATA), kernel text and bug table can also be write-protected. Compared to the first version, this one prevents re-creating deleted mappings in the kernel image range on x86-64, if those got removed previously. This, together with the original changes, prevents temporarily having inconsistent mappings when cacheability attributes are being changed on such pages (e.g. from AGP code). While on i386 such duplicate mappings don't exist, the same change is done there, too, both for consistency and because checking pte_present() before using various other pte_XXX functions is a requirement anyway. At once, i386 code gets adjusted to use pte_huge() instead of open coding this. AK: split out cpa() changes Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 4 ++-- arch/i386/mm/init.c | 25 +++++++++++++++++++------ arch/x86_64/kernel/head.S | 1 - arch/x86_64/kernel/vmlinux.lds.S | 5 +++-- arch/x86_64/mm/init.c | 25 ++++++++++++++++--------- include/linux/poison.h | 3 --- 6 files changed, 40 insertions(+), 23 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6f38f818380..f4ec7223183 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -61,8 +61,6 @@ SECTIONS __stop___ex_table = .; } - RODATA - BUG_TABLE . = ALIGN(4); @@ -72,6 +70,8 @@ SECTIONS __tracedata_end = .; } + RODATA + /* writeable */ . = ALIGN(4096); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 23be1b0aafa..bd5ef371850 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -751,13 +752,25 @@ static int noinline do_test_wp_bit(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() <= 1) +#endif + { + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); + } - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); + start += size; + size = (unsigned long)__end_rodata - start; + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RO); + printk("Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr() requires a global_flush_tlb() call after it. @@ -781,7 +794,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) __free_page(page); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); } void free_initmem(void) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 36aa98a6d15..fd9fdfdd143 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -280,7 +280,6 @@ early_idt_ripmsg: .balign PAGE_SIZE ENTRY(stext) -ENTRY(_stext) #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 5176ecf006e..3bdeb88d28f 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -29,6 +29,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ *(.bootstrap.text) + _stext = .; /* Then all the functions that are "hot" in profiles, to group them onto the same hugetlb entry */ #include "functionlist" @@ -50,10 +51,10 @@ SECTIONS __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; - RODATA - BUG_TABLE + RODATA + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 69e22d3c923..e3134bc9a4f 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -563,21 +564,23 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { struct page *page = pfn_to_page(addr >> PAGE_SHIFT); ClearPageReserved(page); init_page_count(page); memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + if (addr >= __START_KERNEL_map) + change_page_attr_addr(addr, 1, __pgprot(0)); __free_page(page); totalram_pages++; } + if (addr > __START_KERNEL_map) + global_flush_tlb(); } void free_initmem(void) { - memset(__initdata_begin, POISON_FREE_INITDATA, - __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", __pa_symbol(&__init_begin), __pa_symbol(&__init_end)); @@ -587,14 +590,18 @@ void free_initmem(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata)); - unsigned long end = (unsigned long)__va(__pa_symbol(&__end_rodata)); + unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size; - for (; addr < end; addr += PAGE_SIZE) - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() > 1) + start = PFN_ALIGN(__va(__pa_symbol(&_etext))); +#endif + size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start; + change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk ("Write protecting the kernel read-only data: %luk\n", - (__end_rodata - __start_rodata) >> 10); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. diff --git a/include/linux/poison.h b/include/linux/poison.h index 3e628f990fd..89580b76495 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -26,9 +26,6 @@ /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc -/********** arch/x86_64/mm/init.c **********/ -#define POISON_FREE_INITDATA 0xba - /********** arch/ia64/hp/common/sba_iommu.c **********/ /* * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a -- cgit v1.2.3-70-g09d2 From 2bff73830c3df5f575d3bc21bf19df1a10bf7091 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86-64: use lru instead of page->index and page->private for pgd lists management. x86_64 currently simulates a list using the index and private fields of the page struct. Seems that the code was inherited from i386. But x86_64 does not use the slab to allocate pgds and pmds etc. So the lru field is not used by the slab and therefore available. This patch uses standard list operations on page->lru to realize pgd tracking. Signed-off-by: Christoph Lameter Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/mm/fault.c | 5 ++--- include/asm-x86_64/pgalloc.h | 14 +++----------- include/asm-x86_64/pgtable.h | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 6ada7231f3a..de99dba2c51 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -585,7 +585,7 @@ do_sigbus: } DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; +LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { @@ -605,8 +605,7 @@ void vmalloc_sync_all(void) if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); - for (page = pgd_list; page; - page = (struct page *)page->index) { + list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); if (pgd_none(*pgd)) diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h index 31d49717196..8bb56468786 100644 --- a/include/asm-x86_64/pgalloc.h +++ b/include/asm-x86_64/pgalloc.h @@ -44,24 +44,16 @@ static inline void pgd_list_add(pgd_t *pgd) struct page *page = virt_to_page(pgd); spin_lock(&pgd_lock); - page->index = (pgoff_t)pgd_list; - if (pgd_list) - pgd_list->private = (unsigned long)&page->index; - pgd_list = page; - page->private = (unsigned long)&pgd_list; + list_add(&page->lru, &pgd_list); spin_unlock(&pgd_lock); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); + struct page *page = virt_to_page(pgd); spin_lock(&pgd_lock); - next = (struct page *)page->index; - pprev = (struct page **)page->private; - *pprev = next; - if (next) - next->private = (unsigned long)pprev; + list_del(&page->lru); spin_unlock(&pgd_lock); } diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index c1865e38c7b..599993f6ba8 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -410,7 +410,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) extern spinlock_t pgd_lock; -extern struct page *pgd_list; +extern struct list_head pgd_list; void vmalloc_sync_all(void); extern int kern_addr_valid(unsigned long addr); -- cgit v1.2.3-70-g09d2 From ae32b1297a77c23fd0badd642bb685062f7a37f8 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86-64: Inhibit machine from asserting an NMI when doing Alt-SysRq-M operation. This patch touches the NMI watchdog every MAX_ORDER_NR_PAGES to inhibit the machine from triggering an NMI while the CPUs are locked. This situation is happening on boxes with more than 64CPUs and 128GB of RAM when Alt-SysRq-m is performed. It has been succesfully tested for regression on uni, 2, 4, 8 32, and 64 CPU boxes with various memory configuration. Signed-off-by: Andi Kleen --- arch/x86_64/mm/init.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index e3134bc9a4f..282b0a8f00a 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,11 @@ void show_mem(void) for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* this loop can take a while with 256 GB and 4k pages + so update the NMI watchdog */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + touch_nmi_watchdog(); + } page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) -- cgit v1.2.3-70-g09d2 From e3f1caeef9a70b0699518092d653c15274b025ab Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH] x86-64: set node_possible_map at runtime - try 2 Set the node_possible_map at runtime on x86_64. On a non NUMA system, num_possible_nodes() will now say '1'. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Eric Dumazet Cc: David Rientjes Cc: Christoph Lameter --- arch/x86_64/mm/k8topology.c | 7 ++----- arch/x86_64/mm/numa.c | 10 ++++++++-- arch/x86_64/mm/srat.c | 8 +++++--- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index b5b8dba28b4..60e860e5ef4 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -49,11 +49,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) int found = 0; u32 reg; unsigned numnodes; - nodemask_t nodes_parsed; unsigned dualcore = 0; - nodes_clear(nodes_parsed); - if (!early_pci_allowed()) return -1; @@ -102,7 +99,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid, (base>>8)&3, (limit>>8) & 3); return -1; } - if (node_isset(nodeid, nodes_parsed)) { + if (node_isset(nodeid, node_possible_map)) { printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); continue; @@ -155,7 +152,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; - node_set(nodeid, nodes_parsed); + node_set(nodeid, node_possible_map); } if (!found) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 5ee07bc41eb..51548947ad3 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -295,7 +295,7 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, ret = -1; } nodes[nid].end = *addr; - node_set_online(nid); + node_set(nid, node_possible_map); printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, nodes[nid].start, nodes[nid].end, (nodes[nid].end - nodes[nid].start) >> 20); @@ -479,7 +479,7 @@ out: * SRAT. */ remove_all_active_ranges(); - for_each_online_node(i) { + for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -494,20 +494,25 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; + nodes_clear(node_possible_map); + #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, end_pfn)) return; + nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) return; + nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA if (!numa_off && !k8_scan_nodes(start_pfn< Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH] x86-64: Don't enable NUMA for a single node in K8 NUMA scanning This was supposed to see the full memory on a ASUS A8SX motherboard with 4GB RAM where the northbridge reports less memory, but it didn't help there. But it's a reasonable change so let's include it anyways. Signed-off-by: Andi Kleen --- arch/x86_64/mm/k8topology.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index 60e860e5ef4..f983c75825d 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -62,6 +62,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -1; printk(KERN_INFO "Number of nodes %d\n", numnodes); -- cgit v1.2.3-70-g09d2 From e3ebadd95cb621e2c7436f3d3646447ac9d5c16d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 May 2007 08:44:24 -0700 Subject: Revert "[PATCH] x86: __pa and __pa_symbol address space separation" This was broken. It adds complexity, for no good reason. Rather than separate __pa() and __pa_symbol(), we should deprecate __pa_symbol(), and preferably __pa() too - and just use "virt_to_phys()" instead, which is more readable and has nicer semantics. However, right now, just undo the separation, and make __pa_symbol() be the exact same as __pa(). That fixes the bugs this patch introduced, and we can do the fairly obvious cleanups later. Do the new __phys_addr() function (which is now the actual workhorse for the unified __pa()/__pa_symbol()) as a real external function, that way all the potential issues with compile/link-time optimizations of constant symbol addresses go away, and we can also, if we choose to, add more sanity-checking of the argument. Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Andi Kleen Cc: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/alternative.c | 4 ++-- arch/i386/mm/init.c | 15 +++++++-------- arch/x86_64/kernel/machine_kexec.c | 16 ++++++++-------- arch/x86_64/kernel/setup.c | 9 ++++----- arch/x86_64/kernel/smp.c | 2 +- arch/x86_64/mm/init.c | 31 ++++++++++++++++++------------- arch/x86_64/mm/ioremap.c | 9 +++++++++ arch/x86_64/mm/pageattr.c | 16 ++++++++-------- include/asm-x86_64/page.h | 18 +++++++----------- include/asm-x86_64/pgtable.h | 4 ++-- 10 files changed, 66 insertions(+), 58 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index e5cec6685cc..d8cda14fff8 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -390,8 +390,8 @@ void __init alternative_instructions(void) _text, _etext); } free_init_pages("SMP alternatives", - __pa_symbol(&__smp_locks), - __pa_symbol(&__smp_locks_end)); + (unsigned long)__smp_locks, + (unsigned long)__smp_locks_end); } else { alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index dbe16f63a56..1a7197e89eb 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -843,11 +843,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) { - struct page *page = pfn_to_page(addr >> PAGE_SHIFT); - ClearPageReserved(page); - init_page_count(page); - memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); - __free_page(page); + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + free_page(addr); totalram_pages++; } printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); @@ -856,14 +855,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { free_init_pages("unused kernel memory", - __pa_symbol(&__init_begin), - __pa_symbol(&__init_end)); + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", __pa(start), __pa(end)); + free_init_pages("initrd memory", start, end); } #endif diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index a8bb33c1a8f..c3a55470367 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -189,21 +189,21 @@ NORET_TYPE void machine_kexec(struct kimage *image) control_page = page_address(image->control_code_page) + PAGE_SIZE; memcpy(control_page, relocate_kernel, PAGE_SIZE); - page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; - page_list[PA_PGD] = __pa_symbol(&kexec_pgd); + page_list[PA_PGD] = virt_to_phys(&kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; - page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0); + page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); page_list[VA_PUD_0] = (unsigned long)kexec_pud0; - page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0); + page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0); + page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1); + page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); page_list[VA_PUD_1] = (unsigned long)kexec_pud1; - page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1); + page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; - page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1); + page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; page_list[PA_TABLE_PAGE] = diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index db30b5bcef6..db51577bda3 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -245,12 +245,11 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; - init_mm.pgd = __va(__pa_symbol(&init_level4_pgt)); - code_resource.start = __pa_symbol(&_text); - code_resource.end = __pa_symbol(&_etext)-1; - data_resource.start = __pa_symbol(&_etext); - data_resource.end = __pa_symbol(&_edata)-1; + code_resource.start = virt_to_phys(&_text); + code_resource.end = virt_to_phys(&_etext)-1; + data_resource.start = virt_to_phys(&_etext); + data_resource.end = virt_to_phys(&_edata)-1; early_identify_cpu(&boot_cpu_data); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 22abae4e9f3..bd1d123947c 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -76,7 +76,7 @@ static inline void leave_mm(int cpu) if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(init_mm.pgd); + load_cr3(swapper_pg_dir); } /* diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 282b0a8f00a..c0822683b91 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -572,13 +572,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { - struct page *page = pfn_to_page(addr >> PAGE_SHIFT); - ClearPageReserved(page); - init_page_count(page); - memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); if (addr >= __START_KERNEL_map) change_page_attr_addr(addr, 1, __pgprot(0)); - __free_page(page); + free_page(addr); totalram_pages++; } if (addr > __START_KERNEL_map) @@ -588,26 +588,31 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { free_init_pages("unused kernel memory", - __pa_symbol(&__init_begin), - __pa_symbol(&__init_end)); + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size; + unsigned long start = (unsigned long)_stext, end; #ifdef CONFIG_HOTPLUG_CPU /* It must still be possible to apply SMP alternatives. */ if (num_possible_cpus() > 1) - start = PFN_ALIGN(__va(__pa_symbol(&_etext))); + start = (unsigned long)_etext; #endif - size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start; - change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO); + end = (unsigned long)__end_rodata; + start = (start + PAGE_SIZE - 1) & PAGE_MASK; + end &= PAGE_MASK; + if (end <= start) + return; + + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - size >> 10); + (end - start) >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. @@ -622,7 +627,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", __pa(start), __pa(end)); + free_init_pages("initrd memory", start, end); } #endif diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index c6e5e8d401a..6cac90aa503 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -13,12 +13,21 @@ #include #include #include + #include #include #include #include #include +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) + return x - __START_KERNEL_map + phys_base; + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + #define ISA_START_ADDRESS 0xa0000 #define ISA_END_ADDRESS 0x100000 diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index bf4aa8dd425..d653d0bf3df 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -51,6 +51,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, SetPagePrivate(base); page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { @@ -100,12 +101,13 @@ static inline void save_page(struct page *fpage) * No more special protections in this 2/4MB area - revert to a * large page again. */ -static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot) +static void revert_page(unsigned long address, pgprot_t ref_prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t large_pte; + unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -113,6 +115,7 @@ static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_p BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); + pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); @@ -138,8 +141,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ struct page *split; ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(pfn << PAGE_SHIFT, prot, - ref_prot2); + split = split_large_page(address, prot, ref_prot2); if (!split) return -ENOMEM; set_pte(kpte, mk_pte(split, ref_prot2)); @@ -158,7 +160,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, if (page_private(kpte_page) == 0) { save_page(kpte_page); - revert_page(address, pfn, ref_prot); + revert_page(address, ref_prot); } return 0; } @@ -178,7 +180,6 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { - unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; int err = 0, kernel_map = 0; int i; @@ -199,11 +200,10 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) } /* Handle kernel mapping too which aliases part of the * lowmem */ - if ((pfn >= phys_base_pfn) && - ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) { + if (__pa(address) < KERNEL_TEXT_SIZE) { unsigned long addr2; pgprot_t prot2; - addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT); + addr2 = __START_KERNEL_map + __pa(address); /* Make sure the kernel mappings stay executable */ prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); err = __change_page_attr(addr2, pfn, prot2, diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index b17fc16ec2e..4d04e247956 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -94,26 +94,22 @@ extern unsigned long phys_base; #define KERNEL_TEXT_SIZE (40*1024*1024) #define KERNEL_TEXT_START 0xffffffff80000000 +#define PAGE_OFFSET __PAGE_OFFSET #ifndef __ASSEMBLY__ #include -#endif /* __ASSEMBLY__ */ +extern unsigned long __phys_addr(unsigned long); -#define PAGE_OFFSET __PAGE_OFFSET +#endif /* __ASSEMBLY__ */ -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. - Otherwise you risk miscompilation. */ -#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) -/* __pa_symbol should be used for C visible symbols. - This seems to be the official gcc blessed way to do such arithmetic. */ -#define __pa_symbol(x) \ - ({unsigned long v; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - __START_KERNEL_map) + phys_base); }) +#define __pa(x) __phys_addr((unsigned long)(x)) +#define __pa_symbol(x) __phys_addr((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#define __boot_va(x) __va(x) +#define __boot_pa(x) __pa(x) #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) ((pfn) < end_pfn) #endif diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 599993f6ba8..da3390faaea 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; extern unsigned long __supported_pte_mask; -#define swapper_pg_dir ((pgd_t *)NULL) +#define swapper_pg_dir init_level4_pgt extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); @@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size); * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT)) +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3-70-g09d2