From 3aa4b37d3e899cfe7a9cbdcda2b277df4c1f210d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Feb 2008 22:39:43 +0100 Subject: x86: make traps on entry code be debuggable in user space, 64-bit Make the x86-64 behavior for 32-bit processes that set bogus %cs/%ss values (the only ones that can fault in iret) match the native i386 behavior: do not kill the task via do_exit, but generate a SIGSEGV signal instead. [ tglx@linutronix.de: build fix ] Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bea8474744f..e518928114d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -593,13 +593,22 @@ ENTRY(native_iret) .quad native_iret, bad_iret .previous .section .fixup,"ax" - /* force a signal here? this matches i386 behaviour */ - /* running with kernel gs */ bad_iret: - movq $11,%rdi /* SIGSEGV */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - jmp do_exit + /* + * The iret traps when the %cs or %ss being restored is bogus. + * We've lost the original trap vector and error code. + * #GPF is the most likely one to get for an invalid selector. + * So pretend we completed the iret and took the #GPF in user mode. + * + * We are now running with the kernel GS after exception recovery. + * But error_entry expects us to have user GS to match the user %cs, + * so swap back. + */ + pushq $0 + + SWAPGS + jmp general_protection + .previous /* edi: workmask, edx: work */ -- cgit v1.2.3-70-g09d2 From d8b57bb700a73872fd06b891d7c9bc4cea1a6af4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Feb 2008 22:39:43 +0100 Subject: x86: make spurious fault handler aware of large mappings In very rare cases, on certain CPUs, we could end up in the spurious fault handler and ignore a large pud/pmd mapping.
The resulting pte pointer points into the mapped physical space and dereferencing it will fault recursively. Make the code aware of large mappings and do the permission check on the pmd/pud entry, when a large pud/pmd mapping is detected. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ad8b9733d6b..d8ed4006b3d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -428,6 +428,16 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + /* * Handle a spurious fault caused by a stale TLB entry. This allows * us to lazily refresh the TLB when increasing the permissions of a @@ -457,20 +467,21 @@ static int spurious_fault(unsigned long address, if (!pud_present(*pud)) return 0; + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - if ((error_code & PF_WRITE) && !pte_write(*pte)) - return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) - return 0; - - return 1; + return spurious_fault_check(error_code, pte); } /* -- cgit v1.2.3-70-g09d2 From 2d684cd6d9cf0c6a0e28978362671b6e2d8fb56c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: remove X2 workaround With the spurious handler fix, the X2 does not lock up anymore. 
Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 16ce841f08d..c870424aa9a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -260,17 +260,6 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_t old_prot, new_prot; int level, do_split = 1; - /* - * An Athlon 64 X2 showed hard hangs if we tried to preserve - * largepages and changed the PSE entry from RW to RO. - * - * As AMD CPUs have a long series of erratas in this area, - * (and none of the known ones seem to explain this hang), - * disable this code until the hang can be debugged: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return 1; - spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page -- cgit v1.2.3-70-g09d2 From c63855d04034c96db791a7217954c93aa66d24cb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86 ptrace: disallow null cs/ss In my revamp of the x86 ptrace code for setting register values, I accidentally omitted a check that was there in the old code. Allowing %cs to be 0 causes a bad crash in recovery from iret failure. This patch fixes that regression against 2.6.24, and adds a comment that should help prevent this subtlety from being overlooked again. 
Signed-off-by: Roland McGrath Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 96286df1bb8..702c33efea8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -103,9 +103,26 @@ static int set_segment_reg(struct task_struct *task, if (invalid_selector(value)) return -EIO; - if (offset != offsetof(struct user_regs_struct, gs)) + /* + * For %cs and %ss we cannot permit a null selector. + * We can permit a bogus selector as long as it has USER_RPL. + * Null selectors are fine for other segment registers, but + * we will never get back to user mode with invalid %cs or %ss + * and will take the trap in iret instead. Much code relies + * on user_mode() to distinguish a user trap frame (which can + * safely use invalid selectors) from a kernel trap frame. + */ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + if (unlikely(value == 0)) + return -EIO; + + default: *pt_regs_access(task_pt_regs(task), offset) = value; - else { + break; + + case offsetof(struct user_regs_struct, gs): task->thread.gs = value; if (task == current) /* @@ -227,12 +244,16 @@ static int set_segment_reg(struct task_struct *task, * Can't actually change these in 64-bit mode. 
*/ case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->cs = value; #endif break; case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->ss = value; -- cgit v1.2.3-70-g09d2 From 4a5a77d106d6b43183662d4ad37a613bbaa9b829 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: trivial sparse/checkpatch in quirks.c arch/x86/kernel/quirks.c:384:3: warning: returning void-valued expression arch/x86/kernel/quirks.c:387:3: warning: returning void-valued expression arch/x86/kernel/quirks.c:390:3: warning: returning void-valued expression arch/x86/kernel/quirks.c:393:3: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 3cd7a2dcd4f..6ba33ca8715 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -380,19 +380,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, void force_hpet_resume(void) { switch (force_hpet_resume_type) { - case ICH_FORCE_HPET_RESUME: - return ich_force_hpet_resume(); - - case OLD_ICH_FORCE_HPET_RESUME: - return old_ich_force_hpet_resume(); - - case VT8237_FORCE_HPET_RESUME: - return vt8237_force_hpet_resume(); - - case NVIDIA_FORCE_HPET_RESUME: - return nvidia_force_hpet_resume(); - - default: + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + 
nvidia_force_hpet_resume(); + return; + default: break; } } -- cgit v1.2.3-70-g09d2 From deef79ef351225a9fe02e41a40cb125ed03a3e6b Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: fix sparse error in traps_32.c This was being used to ensure the proper alignment of the FXSAVE/FXRSTOR data. This would create a sparse error in the _correct_ cases, hiding further warnings. Use BUILD_BUG_ON instead. Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 3cf72977d01..b22c01e05a1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1176,17 +1176,12 @@ void __init trap_init(void) #endif set_trap_gate(19,&simd_coprocessor_error); + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + * Generate a build-time error if the alignment is wrong. + */ + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generates a compile-time "error: zero width for bit-field" if - * the alignment is wrong. - */ - struct fxsrAlignAssert { - int _:!(offsetof(struct task_struct, - thread.i387.fxsave) & 15); - }; - printk(KERN_INFO "Enabling fast FPU save and restore... 
"); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); -- cgit v1.2.3-70-g09d2 From d7ac12fa05ed839d5a426795409fdf1a480e3f7a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 6 Feb 2008 22:39:44 +0100 Subject: x86: fix sparse warnings in powernow-k8.c arch/x86/kernel/cpu/cpufreq/powernow-k8.c:830:7: warning: symbol 'hi' shadows an earlier one arch/x86/kernel/cpu/cpufreq/powernow-k8.c:824:6: originally declared here arch/x86/kernel/cpu/cpufreq/powernow-k8.c:830:15: warning: symbol 'lo' shadows an earlier one arch/x86/kernel/cpu/cpufreq/powernow-k8.c:824:14: originally declared here Signed-off-by: Harvey Harrison Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a0522735dd9..5affe91ca1e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -827,7 +827,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; - u32 hi = 0, lo = 0; index = data->acpi_data.states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { -- cgit v1.2.3-70-g09d2 From a57dae3aa4d00a000b5bac4238025438204c78b2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix iret exception recovery on 64-bit This change broke recovery of exceptions in iret: commit 72fe4858544292ad64600765cb78bc02298c6b1c Author: Glauber de Oliveira Costa x86: replace privileged instructions with paravirt macros The ENTRY(native_iret) macro adds alignment padding before the iretq instruction, so "iret_label" no longer points exactly at the instruction. It was sloppy to leave the old "iret_label" label behind when replacing its nearby use. 
Removing it would have revealed the other use of the label later in the file, and upon noticing that use, anyone exercising the minimum of attention to detail expected of anyone touching this subtle code would realize it needed to change as well. Signed-off-by: Roland McGrath Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e518928114d..c7341e81941 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -582,7 +582,6 @@ retint_restore_args: /* return to kernel space */ TRACE_IRQS_IRETQ restore_args: RESTORE_ARGS 0,8,0 -iret_label: #ifdef CONFIG_PARAVIRT INTERRUPT_RETURN #endif @@ -920,7 +919,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp + leaq native_iret(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ -- cgit v1.2.3-70-g09d2 From 984bb80d94d891592ab16d4d129b988792752c7b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: mark the .rodata section also NX The .rodata section shouldn't just be read-only, but also non-executable. This is free since we've broken up the 2MB page already anyway. also update test_nx to check for this. 
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/test_nx.c | 2 +- arch/x86/mm/init_64.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 36c100c323a..10b8a6f69f8 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -139,7 +139,6 @@ static int test_NX(void) * Until then, don't run them to avoid too many people getting scared * by the error message */ -#if 0 #ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ @@ -152,6 +151,7 @@ static int test_NX(void) } #endif +#if 0 /* Test 4: Check if the .data section of a module is executable */ if (test_address(&test_data)) { printk(KERN_ERR "test_nx: .data section is executable\n"); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3a98d6f724a..9b61c75a235 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -591,10 +591,17 @@ void mark_rodata_ro(void) if (end <= start) return; - set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. + */ + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + set_memory_nx(start, (end - start) >> PAGE_SHIFT); rodata_test(); -- cgit v1.2.3-70-g09d2 From cc842b82cc513ebc78bef6eeaacb5f6335851bcb Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: remove suprious ifdefs from pageattr.c The .rodata section really should just be read only; the config option is there to make breaking up the 2MB page an option (so people whose machines give more performance for the 2MB case can opt to do so).
But when the page gets split anyway, this is no longer an issue, so clean up the code and remove the ifdefs Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c870424aa9a..8493c855582 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -167,8 +167,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) pgprot_val(forbidden) |= _PAGE_NX; - -#ifdef CONFIG_DEBUG_RODATA /* The .rodata section needs to be read-only */ if (within(address, (unsigned long)__start_rodata, (unsigned long)__end_rodata)) @@ -179,7 +177,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) if (within(address, virt_to_highmap(__start_rodata), virt_to_highmap(__end_rodata))) pgprot_val(forbidden) |= _PAGE_RW; -#endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); -- cgit v1.2.3-70-g09d2 From f1fbabb312d657262322f4ce68b30a95f501945c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix 64-bit sections fix 64-bit section warnings. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4f283ad215e..09b38d539b0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -250,18 +250,13 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ -#ifndef CONFIG_HOTPLUG_CPU - .pushsection .init.data -#endif + __CPUINITDATA .align 8 - .globl initial_code -initial_code: + ENTRY(initial_code) .quad x86_64_start_kernel -#ifndef CONFIG_HOTPLUG_CPU - .popsection -#endif - .globl init_rsp -init_rsp: + __FINITDATA + + ENTRY(init_rsp) .quad init_thread_union+THREAD_SIZE-8 bad_address: -- cgit v1.2.3-70-g09d2 From 971a52d66a3e87d4d2f5d3455e62680447cdb8e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: delay CPA self-test and repeat it delay the CPA self-test so that any impact (corruption) of user-space pagetables can be triggered. Repeat the test every 30 seconds. this would have prevented the bug fixed by 8cb2a7c1e95e472b5, at its source. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 +-- arch/x86/mm/pageattr-test.c | 65 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2e1e3af28c3..fa555148823 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -220,9 +220,9 @@ config DEBUG_BOOT_PARAMS This option will cause struct boot_params to be exported via debugfs. config CPA_DEBUG - bool "CPA self test code" + bool "CPA self-test code" depends on DEBUG_KERNEL help - Do change_page_attr self tests at boot. + Do change_page_attr() self-tests every 30 seconds. 
endmenu diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 398f3a578dd..ed820160035 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -5,6 +5,7 @@ * and compares page tables forwards and afterwards. */ #include +#include #include #include #include @@ -14,8 +15,13 @@ #include #include +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + enum { - NTEST = 4000, + NTEST = 400, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -31,7 +37,7 @@ struct split_state { long min_exec, max_exec; }; -static __init int print_split(struct split_state *s) +static int print_split(struct split_state *s) { long i, expected, missed = 0; int printed = 0; @@ -82,10 +88,13 @@ static __init int print_split(struct split_state *s) s->max_exec = addr; } } - printk(KERN_INFO - "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", - s->spg, s->lpg, s->gpg, s->exec, - s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? 
s->min_exec : 0, + s->max_exec, missed); + } expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; if (expected != i) { @@ -96,11 +105,11 @@ static __init int print_split(struct split_state *s) return err; } -static unsigned long __initdata addr[NTEST]; -static unsigned int __initdata len[NTEST]; +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; /* Change the global bit on random pages in the direct mapping */ -static __init int exercise_pageattr(void) +static int pageattr_test(void) { struct split_state sa, sb, sc; unsigned long *bm; @@ -110,7 +119,8 @@ static __init int exercise_pageattr(void) int i, k; int err; - printk(KERN_INFO "CPA exercising pageattr\n"); + if (print) + printk(KERN_INFO "CPA self-test:\n"); bm = vmalloc((max_pfn_mapped + 7) / 8); if (!bm) { @@ -186,7 +196,6 @@ static __init int exercise_pageattr(void) failed += print_split(&sb); - printk(KERN_INFO "CPA reverting everything\n"); for (i = 0; i < NTEST; i++) { if (!addr[i]) continue; @@ -214,12 +223,40 @@ static __init int exercise_pageattr(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + printk(KERN_ERR "NOT PASSED. 
Please report.\n"); WARN_ON(1); + return -EINVAL; } else { - printk(KERN_INFO "CPA selftests PASSED\n"); + if (print) + printk(KERN_INFO "ok.\n"); } return 0; } -module_init(exercise_pageattr); + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; + } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); + + return 0; +} + +module_init(start_pageattr_test); -- cgit v1.2.3-70-g09d2 From 20651af9ac60fd6e31360688ad44861a7d05256a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix mttr trimming Pavel Emelyanov reported that his networking card did not work and bisected it down to: " The commit 093af8d7f0ba3c6be1485973508584ef081e9f93 x86_32: trim memory by updating e820 broke my e1000 card: on loading driver says that e1000: probe of 0000:04:03.0 failed with error -5 and the interface doesn't appear. " On a 32-bit kernel, base will overflow when it is shifted left by PAGE_SHIFT, so highest_addr will always be less than 4G. So use a pfn instead of an address to avoid the overflow when more than 4G of RAM is installed on a 32-bit kernel. Many thanks to Pavel Emelyanov for reporting and testing it.
Bisected-by: Pavel Emelyanov Signed-off-by: Yinghai Lu Tested-by: Pavel Emelyanov Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1e27b69a7a0..b6e136f23d3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -659,7 +659,7 @@ static __init int amd_special_default_mtrr(void) */ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { - unsigned long i, base, size, highest_addr = 0, def, dummy; + unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 trim_start, trim_size; @@ -682,28 +682,27 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) mtrr_if->get(i, &base, &size, &type); if (type != MTRR_TYPE_WRBACK) continue; - base <<= PAGE_SHIFT; - size <<= PAGE_SHIFT; - if (highest_addr < base + size) - highest_addr = base + size; + if (highest_pfn < base + size) + highest_pfn = base + size; } /* kvm/qemu doesn't have mtrr set right, don't trim them all */ - if (!highest_addr) { + if (!highest_pfn) { printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); WARN_ON(1); return 0; } - if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + if (highest_pfn < end_pfn) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %LdMB of RAM.\n", - (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + " all of memory, losing %luMB of RAM.\n", + (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); WARN_ON(1); printk(KERN_INFO "update e820 for mtrr\n"); - trim_start = highest_addr; + trim_start = highest_pfn; + trim_start <<= PAGE_SHIFT; trim_size = end_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; -- cgit v1.2.3-70-g09d2 From 58d5d0d8dd52cbca988af24b5692a20b00285543 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Feb 2008 22:39:45 +0100 Subject: x86: fix deadlock, make 
pgd_lock irq-safe lockdep just caught this one: ================================= [ INFO: inconsistent lock state ] 2.6.24 #38 --------------------------------- inconsistent {in-softirq-W} -> {softirq-on-W} usage. swapper/1 [HC0[0]:SC0[0]:HE1:SE1] takes: (pgd_lock){-+..}, at: [] mm_init+0x1da/0x250 {in-softirq-W} state was registered at: [] 0xffffffffffffffff irq event stamp: 394559 hardirqs last enabled at (394559): [] get_page_from_freelist+0x30a/0x4c0 hardirqs last disabled at (394558): [] get_page_from_freelist+0x125/0x4c0 softirqs last enabled at (393952): [] __do_softirq+0xce/0xe0 softirqs last disabled at (393945): [] call_softirq+0x1c/0x30 other info that might help us debug this: no locks held by swapper/1. stack backtrace: Pid: 1, comm: swapper Not tainted 2.6.24 #38 Call Trace: [] print_usage_bug+0x18b/0x190 [] mark_lock+0x53d/0x560 [] __lock_acquire+0x3ca/0xed0 [] lock_acquire+0xa8/0xe0 [] ? mm_init+0x1da/0x250 [] _spin_lock+0x30/0x70 [] mm_init+0x1da/0x250 [] mm_alloc+0x39/0x50 [] bprm_mm_init+0x2a/0x1a0 [] do_execve+0x7b/0x220 [] sys_execve+0x46/0x70 [] kernel_execve+0x64/0xd0 [] ? _stext+0x1e/0x20 [] init_post+0x9a/0xf0 [] ? trace_hardirqs_on_thunk+0x35/0x3a [] ? trace_hardirqs_on+0xba/0xd0 [] ? child_rip+0xa/0x12 [] ? restore_args+0x0/0x44 [] ? child_rip+0x0/0x12 turns out that pgd_lock has been used on 64-bit x86 in an irq-unsafe way for almost two years, since commit 8c914cb704a11460e. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 5 +++-- include/asm-x86/pgalloc_64.h | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d8ed4006b3d..621afb6343d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -958,11 +958,12 @@ void vmalloc_sync_all(void) for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); @@ -971,7 +972,7 @@ void vmalloc_sync_all(void) else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); set_bit(pgd_index(address), insync); } if (address == start) diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index 315314ce4bf..4f6220db22b 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -42,19 +42,21 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); + unsigned long flags; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_add(&page->lru, &pgd_list); - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } static inline void pgd_list_del(pgd_t *pgd) { struct page *page = virt_to_page(pgd); + unsigned long flags; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_del(&page->lru); - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) -- cgit v1.2.3-70-g09d2