Diffstat (limited to 'arch/x86')
61 files changed, 605 insertions, 498 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aaed1a3b92d..4a88cf7695b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,8 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KVM + config GENERIC_LOCKBREAK def_bool n @@ -119,8 +121,6 @@ config ARCH_HAS_CPU_RELAX config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 -select HAVE_KVM - config ARCH_HIBERNATION_POSSIBLE def_bool y depends on !SMP || !X86_VOYAGER @@ -1054,7 +1054,7 @@ config SECCOMP config CC_STACKPROTECTOR bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" - depends on X86_64 && EXPERIMENTAL + depends on X86_64 && EXPERIMENTAL && BROKEN help This option turns on the -fstack-protector GCC feature. This feature puts, at the beginning of critical functions, a canary diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e09a6b73a1a..6d50064db18 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,6 +377,19 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +# +# P6_NOPs are a relatively minor optimization that require a family >= +# 6 processor, except that it is broken on certain VIA chips. +# Furthermore, AMD chips prefer a totally different sequence of NOPs +# (which work on all CPUs). As a result, disallow these if we're +# compiling X86_GENERIC but not X86_64 (these NOPs do work on all +# x86-64 capable chips); the list of processors in the right-hand clause +# are the cores that benefit from this optimization. +# +config X86_P6_NOP + def_bool y + depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + config X86_TSC def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 @@ -390,6 +403,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 + default "6" if X86_32 && X86_P6_NOP default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 864affc9a7b..702eb39901c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -156,7 +156,7 @@ config IO_DELAY_TYPE_NONE choice prompt "IO delay type" - default IO_DELAY_0XED + default IO_DELAY_0X80 config IO_DELAY_0X80 bool "port 0x80 based port-IO delay [recommended]" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 204af43535c..f1e739a43d4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -229,7 +229,7 @@ zdisk bzdisk: vmlinux fdimage fdimage144 fdimage288 isoimage: vmlinux $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ -install: vdso_install +install: $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install PHONY += vdso_install diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 378353956b5..e77d89f9e8a 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -37,6 +37,12 @@ static int detect_memory_e820(void) "=m" (*desc) : "D" (desc), "d" (SMAP), "a" (0xe820)); + /* BIOSes which terminate the chain with CF = 1 as opposed + to %ebx = 0 don't always report the SMAP signature on + the final, failing, probe. */ + if (err) + break; + /* Some BIOSes stop returning SMAP in the middle of the search loop. 
We don't know exactly how the BIOS screwed up the map at that point, we might have a @@ -47,9 +53,6 @@ static int detect_memory_e820(void) break; } - if (err) - break; - count++; desc++; } while (next && count < E820MAX); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 76ec0f8f138..4eb5ce84110 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,7 +6,15 @@ extra-y := head_$(BITS).o init_task.o vmlinux.lds extra-$(CONFIG_X86_64) += head64.o CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 + +# +# vsyscalls (which work on the user stack) should have +# no stack-protector checks: +# +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) +CFLAGS_hpet.o := $(nostackp) +CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 680b7300a48..2cdc9de9371 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -72,7 +72,8 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return #define PREFIX "ACPI: " int acpi_noirq; /* skip ACPI IRQ initialization */ -int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ +int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ +EXPORT_SYMBOL(acpi_pci_disabled); int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 10b67170b13..8ca3557a6d5 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -126,6 +126,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d " "state\n", cx->type); } + snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", + cx->address); out: set_cpus_allowed(current, saved_mask); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index afd84463b71..8ea040124f7 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -20,10 +20,8 @@ #include <xen/interface/xen.h> -#ifdef CONFIG_LGUEST_GUEST #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" -#endif #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -130,10 +128,12 @@ void foo(void) OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); #endif -#ifdef CONFIG_LGUEST_GUEST +#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); + + BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9b95edcfc6a..027e5c003b1 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -25,14 +25,6 @@ static int __init no_halt(char *s) __setup("no-hlt", no_halt); -static int __init mca_pentium(char *s) -{ - mca_pentium_flag = 1; - return 1; -} - -__setup("mca-pentium", mca_pentium); - static int __init no_387(char *s) { boot_cpu_data.hard_math = 0; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f86a3c4a266..a38aafaefc2 100644 
--- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; /* Init Machine Check Exception if available. */ mcheck_init(c); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b6e136f23d3..be83336fddb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -43,6 +43,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> +#include <asm/kvm_para.h> #include "mtrr.h" u32 num_var_ranges = 0; @@ -649,6 +650,7 @@ static __init int amd_special_default_mtrr(void) /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number * * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches @@ -688,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); + if (!kvm_para_available()) { + printk(KERN_WARNING + "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + } return 0; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 200fb3f9ebf..e8b422c1c51 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) /* All Transmeta CPUs have a constant TSC */ set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - /* If we can run i686 user-space code, call us an i686 */ -#define USER686 ((1 << X86_FEATURE_TSC)|\ - (1 << X86_FEATURE_CX8)|\ - (1 << X86_FEATURE_CMOV)) - if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686) - c->x86 = 6; - #ifdef CONFIG_SYSCTL /* randomize_va_space slows us down enormously; it probably triggers retranslation of x86->native bytecode */ diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 32dd62b36ff..759e02bec07 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; -struct efi efi_phys __initdata; +static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; static int __init setup_noefi(char *arg) @@ -384,9 +384,6 @@ static void __init runtime_code_page_mkexec(void) efi_memory_desc_t *md; void *p; - if (!(__supported_pte_mask & _PAGE_NX)) - return; - /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; @@ -394,7 +391,7 @@ static void __init runtime_code_page_mkexec(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT); + set_memory_x(md->virt_addr, md->num_pages); } } @@ -428,9 +425,6 @@ void __init efi_enter_virtual_mode(void) else va = efi_ioremap(md->phys_addr, size); - if (md->attribute & EFI_MEMORY_WB) - set_memory_uc(md->virt_addr, size); - md->virt_addr = (u64) (unsigned long) va; if (!va) { @@ -439,6 +433,9 @@ void __init efi_enter_virtual_mode(void) continue; } + if (!(md->attribute & EFI_MEMORY_WB)) + set_memory_uc(md->virt_addr, md->num_pages); + systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && 
systab < end) { systab += md->virt_addr - md->phys_addr; @@ -476,7 +473,8 @@ void __init efi_enter_virtual_mode(void) efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.set_virtual_address_map = virt_efi_set_virtual_address_map; - runtime_code_page_mkexec(); + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; } diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index cb91f985b4a..5d23d85624d 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -28,6 +28,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> +#include <asm/efi.h> /* * To make EFI call EFI runtime service in physical addressing mode we need diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 09d5c233093..d143a1e76b3 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -35,6 +35,7 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/efi.h> +#include <asm/cacheflush.h> static pgd_t save_pgd __initdata; static unsigned long efi_flags __initdata; @@ -43,22 +44,15 @@ static void __init early_mapping_set_exec(unsigned long start, unsigned long end, int executable) { - pte_t *kpte; - unsigned int level; - - while (start < end) { - kpte = lookup_address((unsigned long)__va(start), &level); - BUG_ON(!kpte); - if (executable) - set_pte(kpte, pte_mkexec(*kpte)); - else - set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ - __supported_pte_mask)); - if (level == PG_LEVEL_4K) - start = (start + PAGE_SIZE) & PAGE_MASK; - else - start = (start + PMD_SIZE) & PMD_MASK; - } + unsigned long num_pages; + + start &= PMD_MASK; + end = (end + PMD_SIZE - 1) & PMD_MASK; + num_pages = (end - start) >> PAGE_SHIFT; + if (executable) + set_memory_x((unsigned long)__va(start), num_pages); + else + set_memory_nx((unsigned long)__va(start), num_pages); } static void __init early_runtime_code_mapping_set_exec(int executable) @@ -74,7 +68,7 @@ static void __init early_runtime_code_mapping_set_exec(int executable) md = p; if (md->type == EFI_RUNTIME_SERVICES_CODE) { unsigned long end; - end = md->phys_addr + (md->num_pages << PAGE_SHIFT); + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); early_mapping_set_exec(md->phys_addr, end, executable); } } @@ -84,8 +78,8 @@ void __init efi_call_phys_prelog(void) { unsigned long vaddress; - local_irq_save(efi_flags); early_runtime_code_mapping_set_exec(1); + local_irq_save(efi_flags); vaddress = (unsigned long)__va(0x0UL); save_pgd = *pgd_offset_k(0x0UL); set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); @@ -98,9 +92,9 @@ void __init efi_call_phys_epilog(void) * After the lock is released, the original page table is restored. 
*/ set_pgd(pgd_offset_k(0x0UL), save_pgd); - early_runtime_code_mapping_set_exec(0); __flush_tlb_all(); local_irq_restore(efi_flags); + early_runtime_code_mapping_set_exec(0); } void __init efi_reserve_bootmem(void) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 824e21b80aa..4b87c32b639 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,7 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 -ENTRY(irq_return) +irq_return: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6be39a387c5..c20c9e7e08d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -453,6 +453,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -583,7 +584,7 @@ retint_restore_args: /* return to kernel space */ restore_args: RESTORE_ARGS 0,8,0 -ENTRY(irq_return) +irq_return: INTERRUPT_RETURN .section __ex_table, "a" @@ -1036,15 +1037,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 24dbf56928d..ad2440832de 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -88,6 +88,9 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); + /* Cleanup the over mapped high alias */ + cleanup_highmap(); + for (i = 0; i < IDT_ENTRIES; i++) { #ifdef CONFIG_EARLY_PRINTK set_intr_gate(i, &early_idt_handlers[i]); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 74ef4a41f22..fd8ca53943a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -606,13 +606,13 @@ ENTRY(_stext) .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(swapper_pg_pmd) +swapper_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(swapper_pg_dir) .fill 1024,4,0 #endif -ENTRY(swapper_pg_fixmap) +swapper_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 09b38d539b0..a007454133a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -107,8 +107,13 @@ startup_64: movq %rdx, 0(%rbx, %rax, 8) ident_complete: - /* Fixup the kernel text+data virtual addresses + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. */ + leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -250,7 +255,7 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ - __CPUINITDATA + __REFDATA .align 8 ENTRY(initial_code) .quad x86_64_start_kernel @@ -374,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. 
* Don't set NX because code runs from these pages. */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) - /* 40MB kernel mapping. The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) - /* Module mapping starts here */ - .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 + /* + * 128 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +128MB starts the module area, see MODULES_VADDR. + * If you want to increase this then increase MODULES_VADDR + * too.) + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) - .fill 512,8,0 + .fill 512, 8, 0 #undef PMDS #undef NEXT_PAGE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 429d084e014..235fd6c7750 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -368,8 +368,8 @@ static int hpet_clocksource_register(void) return 0; } -/* - * Try to setup the HPET timer +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 26719bd2c77..763dfc40723 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -39,7 +39,7 @@ #define HAVE_HWFP 1 #endif -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; void mxcsr_feature_mask_init(void) { diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index ef62b07b2b4..8540abe86ad 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - * !using_apic_timer decisions in do_timer_interrupt_hook() */ -struct clock_event_device pit_clockevent = { +static struct clock_event_device pit_clockevent = { .name = "pit", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = init_pit_timer, diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index 2d25b77102f..fe631967d62 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -26,8 +26,6 @@ * present in the majority of PC/AT boxes. * plus some generic x86 specific things if generic specifics makes * any sense at all. 
- * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land */ static int i8259A_auto_eoi; @@ -362,23 +360,12 @@ void __init init_ISA_irqs (void) #endif init_8259A(0); - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); } } diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index bd49321034d..c706a306155 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -13,7 +13,6 @@ #include <asm/io.h> int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; -EXPORT_SYMBOL_GPL(io_delay_type); static int __initdata io_delay_override; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index a99e764fd66..34a591283f5 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -581,7 +581,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. */ -void __kprobes kretprobe_trampoline_holder(void) +static void __used __kprobes kretprobe_trampoline_holder(void) { asm volatile ( ".global kretprobe_trampoline\n" @@ -673,7 +673,7 @@ void __kprobes kretprobe_trampoline_holder(void) /* * Called from kretprobe_trampoline */ -void * __kprobes trampoline_handler(struct pt_regs *regs) +static __used __kprobes void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index edd413650b3..6a0aa703868 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -46,9 +46,6 @@ static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); -/* local prototypes */ -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); - static int endflag __initdata = 0; #ifdef CONFIG_SMP @@ -391,15 +388,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) return rc; } -int do_nmi_callback(struct pt_regs * regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) @@ -453,6 +441,15 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, #endif +int do_nmi_callback(struct pt_regs *regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + void __trigger_all_cpu_backtrace(void) { int i; diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index fb99484d21c..9a4fde74bee 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -46,9 +46,6 @@ static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); -/* local prototypes */ -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); - /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -394,15 +391,6 @@ 
asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) nmi_exit(); } -int do_nmi_callback(struct pt_regs * regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - void stop_nmi(void) { acpi_nmi_disable(); @@ -464,6 +452,15 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, #endif +int do_nmi_callback(struct pt_regs *regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + void __trigger_all_cpu_backtrace(void) { int i; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 65f6acb025c..faf3229f8fb 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -749,6 +749,15 @@ void __init gart_iommu_init(void) */ set_memory_np((unsigned long)__va(iommu_bus_base), iommu_size >> PAGE_SHIFT); + /* + * Tricky. The GART table remaps the physical memory range, + * so the CPU wont notice potential aliases and if the memory + * is remapped to UC later on, we might surprise the PCI devices + * with a stray writeout of a cacheline. So play it sure and + * do an explicit, full-scale wbinvd() _after_ having marked all + * the pages as Not-Present: + */ + wbinvd(); /* * Try to workaround a bug (thanks to BenH) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b0cc8f0136d..43f287744f9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, ®s); + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 702c33efea8..d862e396b09 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1160,7 +1160,7 @@ static int genregs32_set(struct task_struct *target, if (kbuf) { const compat_ulong_t *k = kbuf; while (count > 0 && !ret) { - ret = putreg(target, pos, *k++); + ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } @@ -1171,7 +1171,7 @@ static int genregs32_set(struct task_struct *target, ret = __get_user(word, u++); if (ret) break; - ret = putreg(target, pos, word); + ret = putreg32(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1941482d4ca..c47208fc593 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -11,7 +11,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; - u32 word; + u16 word; /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) @@ -26,8 +26,11 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); - /* read xTPR register */ - raw_pci_read(0, 0, 0x40, 0x4c, 2, &word); + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. 
+ */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 5818dc28167..7fd6ac43e4a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -326,7 +326,7 @@ static inline void kb_wait(void) } } -void machine_emergency_restart(void) +static void native_machine_emergency_restart(void) { int i; @@ -376,7 +376,7 @@ void machine_emergency_restart(void) } } -void machine_shutdown(void) +static void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP @@ -420,7 +420,7 @@ void machine_shutdown(void) #endif } -void machine_restart(char *__unused) +static void native_machine_restart(char *__unused) { printk("machine restart\n"); @@ -429,11 +429,11 @@ void machine_restart(char *__unused) machine_emergency_restart(); } -void machine_halt(void) +static void native_machine_halt(void) { } -void machine_power_off(void) +static void native_machine_power_off(void) { if (pm_power_off) { if (!reboot_force) @@ -443,9 +443,35 @@ void machine_power_off(void) } struct machine_ops machine_ops = { - .power_off = machine_power_off, - .shutdown = machine_shutdown, - .emergency_restart = machine_emergency_restart, - .restart = machine_restart, - .halt = machine_halt + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt }; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + machine_ops.emergency_restart(); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 691ab4cb167..a1d7071a51c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -164,7 +164,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE; unsigned int machine_id; unsigned int machine_submodel_id; unsigned int BIOS_revision; -unsigned int mca_pentium_flag; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c0d8208af12..7637dc91c79 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -518,7 +518,7 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) } #ifdef CONFIG_NUMA -static int nearby_node(int apicid) +static int __cpuinit nearby_node(int apicid) { int i, node; @@ -791,7 +791,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void srat_detect_node(void) +static void __cpuinit srat_detect_node(void) { #ifdef CONFIG_NUMA unsigned node; @@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; #ifdef CONFIG_X86_MCE mcheck_init(c); @@ -1046,7 +1046,7 @@ __setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) - printk(KERN_INFO "%s", c->x86_model_id); + printk(KERN_CONT "%s", c->x86_model_id); if (c->x86_mask || c->cpuid_level >= 0) printk(KERN_CONT " stepping %02x\n", c->x86_mask); diff --git 
a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index d53bd6fcb42..0880f2c388a 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { - .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; + INIT_WORK(&c_idle.work, do_fork_idle); /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 02f0f61f5b1..c28c342c162 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; + if (!reliable) + return; if (trace->skip > 0) { trace->skip--; return; @@ -37,6 +39,8 @@ static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; + if (!reliable) + return; if (in_sched_functions(addr)) return; if (trace->skip > 0) { diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 4c163772000..c29e235792a 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -10,8 +10,8 @@ * of the License. */ #include <linux/module.h> +#include <asm/cacheflush.h> #include <asm/sections.h> -extern int rodata_test_data; int rodata_test(void) { diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index a40051b71d9..0fcc95a354f 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -34,7 +34,7 @@ static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); #ifdef CONFIG_HOTPLUG_CPU -int arch_register_cpu(int num) +int __ref arch_register_cpu(int num) { /* * CPU0 cannot be offlined due to several diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index efc66df728b..04546668191 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -84,7 +84,7 @@ static inline void conditional_sti(struct pt_regs *regs) static inline void preempt_conditional_sti(struct pt_regs *regs) { - preempt_disable(); + inc_preempt_count(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -95,7 +95,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ - preempt_enable_no_resched(); + dec_preempt_count(); } int kstack_depth_to_print = 12; diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 43517e324be..f14cfd9d1f9 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); + "cannot disable TSC completely.\n"); + mark_tsc_unstable("user disabled TSC"); return 1; } #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index f1148ac8abe..2ffa9656fe7 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -38,7 +38,7 @@ SECTIONS /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(4096); /* not really needed, already page aligned */ + . 
= ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ *(.text.page_aligned) TEXT_TEXT SCHED_TEXT @@ -70,21 +70,21 @@ SECTIONS RODATA /* writeable */ - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ DATA_DATA CONSTRUCTORS } :data - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { __nosave_begin = .; *(.data.nosave) - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __nosave_end = .; } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { *(.data.page_aligned) *(.data.idt) @@ -108,7 +108,7 @@ SECTIONS } /* might get freed after init */ - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) @@ -120,10 +120,10 @@ SECTIONS * after boot. Always make sure that ALIGN() directive is present after * the section which contains __smp_alt_end. */ - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); /* will be freed after init */ - . = ALIGN(4096); /* Init code and data */ + . = ALIGN(PAGE_SIZE); /* Init code and data */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { __init_begin = .; _sinittext = .; @@ -174,23 +174,23 @@ SECTIONS EXIT_DATA } #if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { __initramfs_start = .; *(.init.ramfs) __initramfs_end = .; } #endif - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) *(.data.percpu.shared_aligned) __per_cpu_end = .; } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); /* freed after init ends here */ - + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __init_end = .; __bss_start = .; /* BSS */ @@ -200,7 +200,7 @@ SECTIONS __bss_stop = .; _end = . ; /* This is where the kernel creates the early boot page tables */ - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); pg0 = . ; } diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 0992b9946c6..fab13229973 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -37,7 +37,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - _etext = .; /* End of text section */ + _etext = .; /* End of text section */ } :text = 0x9090 . = ALIGN(16); /* Exception table */ @@ -60,7 +60,7 @@ SECTIONS __tracedata_end = .; } - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA @@ -119,7 +119,7 @@ SECTIONS .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } - . = VSYSCALL_VIRT_ADDR + 4096; + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; #undef VSYSCALL_ADDR #undef VSYSCALL_PHYS_ADDR @@ -129,28 +129,28 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - . = ALIGN(8192); /* init_task */ + . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) }:data.init - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { *(.data.page_aligned) } /* might get freed after init */ - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __smp_alt_begin = .; __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { *(.smp_locks) } __smp_locks_end = .; - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __smp_alt_end = .; - . = ALIGN(4096); /* Init code and data */ + . 
= ALIGN(PAGE_SIZE); /* Init code and data */ __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; @@ -191,7 +191,7 @@ SECTIONS .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { *(.altinstructions) } - __alt_instructions_end = .; + __alt_instructions_end = .; .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } @@ -207,25 +207,25 @@ SECTIONS /* vdso blob that is mapped into user space */ vdso_start = . ; .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); vdso_end = .; #ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __initramfs_start = .; .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; #endif - PERCPU(4096) + PERCPU(PAGE_SIZE) - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __init_end = .; - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __nosave_begin = .; .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); __nosave_end = .; __bss_start = .; /* BSS */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3f824277458..b6be812fac0 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,11 +44,6 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","cx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz) static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -227,50 +222,10 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. 
*/ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .mode = 0644 }, {} }; @@ -279,7 +234,6 @@ static ctl_table kernel_root_table2[] = { .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5afdde4895d..cccb38a5965 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include <linux/lguest_launcher.h> #include <linux/virtio_console.h> #include <linux/pm.h> +#include <asm/lguest.h> #include <asm/paravirt.h> #include <asm/param.h> #include <asm/page.h> @@ -75,15 +76,6 @@ * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*/ -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; -extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; -extern void lguest_iret(void); - struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, @@ -489,7 +481,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); + (__pa(pmdp)&(PAGE_SIZE-1)), 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index fd42a4a095f..459b58a8a15 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -1,117 +1,129 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs. +/* + * Copyright 2002, 2003 Andi Kleen, SuSE Labs. * Subject to the GNU Public License v.2 - * + * * Wrappers of assembly checksum functions for x86-64. */ - #include <asm/checksum.h> #include <linux/module.h> -/** - * csum_partial_copy_from_user - Copy and checksum from user space. - * @src: source address (user space) +/** + * csum_partial_copy_from_user - Copy and checksum from user space. + * @src: source address (user space) * @dst: destination address * @len: number of bytes to be copied. * @isum: initial sum that is added into the result (32bit unfolded) * @errp: set to -EFAULT for an bad source address. - * + * * Returns an 32bit unfolded checksum of the buffer. - * src and dst are best aligned to 64bits. - */ + * src and dst are best aligned to 64bits. + */ __wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp) -{ +{ might_sleep(); *errp = 0; - if (likely(access_ok(VERIFY_READ,src, len))) { - /* Why 6, not 7? To handle odd addresses aligned we - would need to do considerable complications to fix the - checksum which is defined as an 16bit accumulator. 
The - fix alignment code is primarily for performance - compatibility with 32bit and that will handle odd - addresses slowly too. */ - if (unlikely((unsigned long)src & 6)) { - while (((unsigned long)src & 6) && len >= 2) { - __u16 val16; - *errp = __get_user(val16, (const __u16 __user *)src); - if (*errp) - return isum; - *(__u16 *)dst = val16; - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - src += 2; - dst += 2; - len -= 2; - } + + if (!likely(access_ok(VERIFY_READ, src, len))) + goto out_err; + + /* + * Why 6, not 7? To handle odd addresses aligned we + * would need to do considerable complications to fix the + * checksum which is defined as an 16bit accumulator. The + * fix alignment code is primarily for performance + * compatibility with 32bit and that will handle odd + * addresses slowly too. + */ + if (unlikely((unsigned long)src & 6)) { + while (((unsigned long)src & 6) && len >= 2) { + __u16 val16; + + *errp = __get_user(val16, (const __u16 __user *)src); + if (*errp) + return isum; + + *(__u16 *)dst = val16; + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); + src += 2; + dst += 2; + len -= 2; } - isum = csum_partial_copy_generic((__force const void *)src, - dst, len, isum, errp, NULL); - if (likely(*errp == 0)) - return isum; - } + } + isum = csum_partial_copy_generic((__force const void *)src, + dst, len, isum, errp, NULL); + if (unlikely(*errp)) + goto out_err; + + return isum; + +out_err: *errp = -EFAULT; - memset(dst,0,len); - return isum; -} + memset(dst, 0, len); + return isum; +} EXPORT_SYMBOL(csum_partial_copy_from_user); -/** - * csum_partial_copy_to_user - Copy and checksum to user space. +/** + * csum_partial_copy_to_user - Copy and checksum to user space. * @src: source address * @dst: destination address (user space) * @len: number of bytes to be copied. * @isum: initial sum that is added into the result (32bit unfolded) * @errp: set to -EFAULT for an bad destination address. - * + * * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. - */ + */ __wsum csum_partial_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) -{ +{ might_sleep(); + if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { *errp = -EFAULT; - return 0; + return 0; } if (unlikely((unsigned long)dst & 6)) { - while (((unsigned long)dst & 6) && len >= 2) { + while (((unsigned long)dst & 6) && len >= 2) { __u16 val16 = *(__u16 *)src; + isum = (__force __wsum)add32_with_carry( (__force unsigned)isum, val16); *errp = __put_user(val16, (__u16 __user *)dst); if (*errp) return isum; - src += 2; - dst += 2; + src += 2; + dst += 2; len -= 2; } } *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); -} - + return csum_partial_copy_generic(src, (void __force *)dst, + len, isum, NULL, errp); +} EXPORT_SYMBOL(csum_partial_copy_to_user); -/** +/** * csum_partial_copy_nocheck - Copy and checksum. * @src: source address * @dst: destination address * @len: number of bytes to be copied. * @isum: initial sum that is added into the result (32bit unfolded) - * + * * Returns an 32bit unfolded checksum of the buffer. 
- */ + */ __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) -{ - return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); -} +{ + return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); +} EXPORT_SYMBOL(csum_partial_copy_nocheck); __sum16 csum_ipv6_magic(const struct in6_addr *saddr, @@ -119,17 +131,20 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, __u32 len, unsigned short proto, __wsum sum) { __u64 rest, sum64; - + rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + (__force __u64)sum; - asm(" addq (%[saddr]),%[sum]\n" - " adcq 8(%[saddr]),%[sum]\n" - " adcq (%[daddr]),%[sum]\n" - " adcq 8(%[daddr]),%[sum]\n" - " adcq $0,%[sum]\n" - : [sum] "=r" (sum64) - : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); - return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); -} + asm(" addq (%[saddr]),%[sum]\n" + " adcq 8(%[saddr]),%[sum]\n" + " adcq (%[daddr]),%[sum]\n" + " adcq 8(%[daddr]),%[sum]\n" + " adcq $0,%[sum]\n" + + : [sum] "=r" (sum64) + : "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr)); + + return csum_fold( + (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); +} EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c index 87b4a4e1803..3f1eb59b5f0 100644 --- a/arch/x86/lib/io_64.c +++ b/arch/x86/lib/io_64.c @@ -1,23 +1,25 @@ #include <linux/string.h> -#include <asm/io.h> #include <linux/module.h> +#include <asm/io.h> -void __memcpy_toio(unsigned long dst,const void*src,unsigned len) +void __memcpy_toio(unsigned long dst, const void *src, unsigned len) { - __inline_memcpy((void *) dst,src,len); + __inline_memcpy((void *)dst, src, len); } EXPORT_SYMBOL(__memcpy_toio); -void __memcpy_fromio(void *dst,unsigned long src,unsigned len) +void __memcpy_fromio(void *dst, unsigned long src, unsigned len) { - __inline_memcpy(dst,(const void *) src,len); + __inline_memcpy(dst, (const void *)src, len); } EXPORT_SYMBOL(__memcpy_fromio); void memset_io(volatile void __iomem *a, int b, size_t c) { - /* XXX: memset can mangle the IO patterns quite a bit. - perhaps it would be better to use a dumb one */ - memset((void *)a,b,c); + /* + * TODO: memset can mangle the IO patterns quite a bit. 
+ * perhaps it would be better to use a dumb one: + */ + memset((void *)a, b, c); } EXPORT_SYMBOL(memset_io); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 621afb6343d..fdc667422df 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -186,7 +186,7 @@ static int bad_address(void *p) } #endif -void dump_pagetable(unsigned long address) +static void dump_pagetable(unsigned long address) { #ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8106bba41ec..ee1091a4696 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -47,6 +47,7 @@ #include <asm/sections.h> #include <asm/paravirt.h> #include <asm/setup.h> +#include <asm/cacheflush.h> unsigned int __VMALLOC_RESERVE = 128 << 20; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b59fc238151..a02a14f0f32 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -45,6 +45,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/numa.h> +#include <asm/cacheflush.h> const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); @@ -170,6 +171,34 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) __flush_tlb_one(vaddr); } +/* + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) + * + * phys_addr holds the negative offset to the kernel, which is added + * to the compile time generated pmds. This results in invalid pmds up + * to the point where we hit the physaddr 0 mapping. + * + * We limit the mappings to the region from _text to _end. _end is + * rounded up to the 2MB boundary. This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; + pmd_t *last_pmd = pmd + PTRS_PER_PMD; + + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { + if (!pmd_present(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); + } +} + /* NOTE: this is meant to be run only at boot */ void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) @@ -487,14 +516,6 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - /* temporary debugging - double check it's true: */ - { - int i; - - for (i = 0; i < 1024; i++) - WARN_ON_ONCE(empty_zero_page[i]); - } - reservedpages = 0; /* this will put all low memory onto the freelists */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a4897a85268..882328efc3d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -42,6 +42,22 @@ int page_is_ram(unsigned long pagenr) unsigned long addr, end; int i; + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + if (pagenr == 0) + return 0; + + /* + * Second special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. 
+	 */
+	if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+	    pagenr < (BIOS_END >> PAGE_SHIFT))
+		return 0;
+
 	for (i = 0; i < e820.nr_map; i++) {
 		/*
 		 * Not usable memory:
@@ -51,14 +67,6 @@ int page_is_ram(unsigned long pagenr)
 		addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
 		end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
 
-		/*
-		 * Sanity check: Some BIOSen report areas as RAM that
-		 * are not. Notably the 640->1Mb area, which is the
-		 * PCI BIOS area.
-		 */
-		if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
-		    end < (BIOS_END >> PAGE_SHIFT))
-			continue;
 
 		if ((pagenr >= addr) && (pagenr < end))
 			return 1;
@@ -126,6 +134,8 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 		return NULL;
 	}
 
+	WARN_ON_ONCE(page_is_ram(pfn));
+
 	switch (mode) {
 	case IOR_MODE_UNCACHED:
 	default:
@@ -265,7 +275,9 @@ static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
 
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
-	pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)];
+	/* Don't assume we're using swapper_pg_dir at this point */
+	pgd_t *base = __va(read_cr3());
+	pgd_t *pgd = &base[pgd_index(addr)];
 	pud_t *pud = pud_offset(pgd, addr);
 	pmd_t *pmd = pmd_offset(pud, addr);
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1aecc658cd7..59898fb0a4a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -494,11 +494,13 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	int i;
 
 	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
 	if (cmdline && !numa_emulation(start_pfn, end_pfn))
 		return;
 	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
@@ -506,6 +508,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 					  end_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
@@ -513,6 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 					end_pfn<<PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
 #endif
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
 
@@ -524,7 +528,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	memnode_shift = 63;
 	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
-	nodes_clear(node_online_map);
 	node_set_online(0);
 	node_set(0, node_possible_map);
 	for (i = 0; i < NR_CPUS; i++)
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index ed820160035..75f1b109aae 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -40,7 +40,6 @@ struct split_state {
 static int print_split(struct split_state *s)
 {
 	long i, expected, missed = 0;
-	int printed = 0;
 	int err = 0;
 
 	s->lpg = s->gpg = s->spg = s->exec = 0;
@@ -53,12 +52,6 @@ static int print_split(struct split_state *s)
 
 		pte = lookup_address(addr, &level);
 		if (!pte) {
-			if (!printed) {
-				dump_pagetable(addr);
-				printk(KERN_INFO "CPA %lx no pte level %d\n",
-					addr, level);
-				printed = 1;
-			}
 			missed++;
 			i++;
 			continue;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 440210a2277..14e48b5a94b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
 #include <asm/sections.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
+#include <asm/proto.h>
 
 /*
  * The current flushing context - we pass it instead of 5 arguments:
@@ -26,8 +27,29 @@ struct cpa_data {
 	pgprot_t	mask_clr;
 	int		numpages;
 	int		flushtlb;
+	unsigned long	pfn;
 };
 
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+	return __pa(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+	return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -123,29 +145,14 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	}
 }
 
-#define HIGH_MAP_START	__START_KERNEL_map
-#define HIGH_MAP_END	(__START_KERNEL_map + KERNEL_TEXT_SIZE)
-
-
-/*
- * Converts a virtual address to a X86-64 highmap address
- */
-static unsigned long virt_to_highmap(void *address)
-{
-#ifdef CONFIG_X86_64
-	return __pa((unsigned long)address) + HIGH_MAP_START - phys_base;
-#else
-	return (unsigned long)address;
-#endif
-}
-
 /*
  * Certain areas of memory on x86 require very specific protection flags,
  * for example the BIOS area or kernel text. Callers don't always get this
  * right (again, ioremap() on BIOS memory is not uncommon) so this function
  * checks and fixes these known static required protection bits.
  */
-static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+					   unsigned long pfn)
 {
 	pgprot_t forbidden = __pgprot(0);
 
@@ -153,30 +160,23 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
 	 * The BIOS area between 640k and 1Mb needs to be executable for
 	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 	 */
-	if (within(__pa(address), BIOS_BEGIN, BIOS_END))
+	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_NX;
 
 	/*
 	 * The kernel text needs to be executable for obvious reasons
-	 * Does not cover __inittext since that is gone later on
+	 * Does not cover __inittext since that is gone later on. On
+	 * 64bit we do not enforce !NX on the low mapping
 	 */
 	if (within(address, (unsigned long)_text, (unsigned long)_etext))
 		pgprot_val(forbidden) |= _PAGE_NX;
-	/*
-	 * Do the same for the x86-64 high kernel mapping
-	 */
-	if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
-		pgprot_val(forbidden) |= _PAGE_NX;
-
-	/* The .rodata section needs to be read-only */
-	if (within(address, (unsigned long)__start_rodata,
-				(unsigned long)__end_rodata))
-		pgprot_val(forbidden) |= _PAGE_RW;
 	/*
-	 * Do the same for the x86-64 high kernel mapping
+	 * The .rodata section needs to be read-only. Using the pfn
+	 * catches all aliases.
 	 */
-	if (within(address, virt_to_highmap(__start_rodata),
-				virt_to_highmap(__end_rodata)))
+	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
+		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
@@ -253,7 +253,7 @@ static int
 try_preserve_large_page(pte_t *kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr;
+	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
 	pgprot_t old_prot, new_prot;
 	int i, do_split = 1;
@@ -275,8 +275,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 		break;
 #ifdef CONFIG_X86_64
 	case PG_LEVEL_1G:
-		psize = PMD_PAGE_SIZE;
-		pmask = PMD_PAGE_MASK;
+		psize = PUD_PAGE_SIZE;
+		pmask = PUD_PAGE_MASK;
 		break;
 #endif
 	default:
@@ -301,7 +301,15 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
 	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
-	new_prot = static_protections(new_prot, address);
+
+	/*
+	 * old_pte points to the large page base address. So we need
+	 * to add the offset of the virtual address:
+	 */
+	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
+	cpa->pfn = pfn;
+
+	new_prot = static_protections(new_prot, address, pfn);
 
 	/*
 	 * We need to check the full range, whether
@@ -309,8 +317,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * the pages in the range we try to preserve:
 	 */
 	addr = address + PAGE_SIZE;
-	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) {
-		pgprot_t chk_prot = static_protections(new_prot, addr);
+	pfn++;
+	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
+		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
 
 		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 			goto out_unlock;
@@ -352,45 +361,48 @@ out_unlock:
 
 static LIST_HEAD(page_pool);
 static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed, pool_refill;
+static unsigned long pool_used, pool_failed;
 
-static void cpa_fill_pool(void)
+static void cpa_fill_pool(struct page **ret)
 {
-	struct page *p;
 	gfp_t gfp = GFP_KERNEL;
+	unsigned long flags;
+	struct page *p;
 
-	/* Do not allocate from interrupt context */
-	if (in_irq() || irqs_disabled())
-		return;
 	/*
-	 * Check unlocked. I does not matter when we have one more
-	 * page in the pool. The bit lock avoids recursive pool
-	 * allocations:
+	 * Avoid recursion (on debug-pagealloc) and also signal
	 * our priority to get to these pagetables:
 	 */
-	if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+	if (current->flags & PF_MEMALLOC)
 		return;
+	current->flags |= PF_MEMALLOC;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
 	/*
-	 * We could do:
-	 * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
-	 * but this fails on !PREEMPT kernels
+	 * Allocate atomically from atomic contexts:
 	 */
-	gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-#endif
+	if (in_atomic() || irqs_disabled() || debug_pagealloc)
+		gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
 
-	while (pool_pages < pool_size) {
+	while (pool_pages < pool_size || (ret && !*ret)) {
 		p = alloc_pages(gfp, 0);
 		if (!p) {
 			pool_failed++;
 			break;
 		}
-		spin_lock_irq(&pgd_lock);
+		/*
+		 * If the call site needs a page right now, provide it:
+		 */
+		if (ret && !*ret) {
+			*ret = p;
+			continue;
+		}
+		spin_lock_irqsave(&pgd_lock, flags);
 		list_add(&p->lru, &page_pool);
 		pool_pages++;
-		spin_unlock_irq(&pgd_lock);
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
-	clear_bit_unlock(0, &pool_refill);
+
+	current->flags &= ~PF_MEMALLOC;
 }
 
 #define SHIFT_MB		(20 - PAGE_SHIFT)
@@ -411,11 +423,15 @@ void __init cpa_init(void)
 	 * GiB. Shift MiB to Gib and multiply the result by
 	 * POOL_PAGES_PER_GB:
 	 */
-	gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
-	pool_size = POOL_PAGES_PER_GB * gb;
+	if (debug_pagealloc) {
+		gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+		pool_size = POOL_PAGES_PER_GB * gb;
+	} else {
+		pool_size = 1;
+	}
 	pool_low = pool_size;
 
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
 	printk(KERN_DEBUG
 	       "CPA: page pool initialized %lu of %lu pages preallocated\n",
 	       pool_pages, pool_size);
@@ -437,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	spin_lock_irqsave(&pgd_lock, flags);
 	if (list_empty(&page_pool)) {
 		spin_unlock_irqrestore(&pgd_lock, flags);
-		return -ENOMEM;
+		base = NULL;
+		cpa_fill_pool(&base);
+		if (!base)
+			return -ENOMEM;
+		spin_lock_irqsave(&pgd_lock, flags);
+	} else {
+		base = list_first_entry(&page_pool, struct page, lru);
+		list_del(&base->lru);
+		pool_pages--;
+
+		if (pool_pages < pool_low)
+			pool_low = pool_pages;
 	}
 
-	base = list_first_entry(&page_pool, struct page, lru);
-	list_del(&base->lru);
-	pool_pages--;
-
-	if (pool_pages < pool_low)
-		pool_low = pool_pages;
-
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -505,46 +525,46 @@ out_unlock:
 	return 0;
 }
 
-static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
+static int __change_page_attr(struct cpa_data *cpa, int primary)
 {
+	unsigned long address = cpa->vaddr;
 	int do_split, err;
 	unsigned int level;
-	struct page *kpte_page;
-	pte_t *kpte;
+	pte_t *kpte, old_pte;
 
 repeat:
 	kpte = lookup_address(address, &level);
 	if (!kpte)
-		return -EINVAL;
+		return primary ? -EINVAL : 0;
 
-	kpte_page = virt_to_page(kpte);
-	BUG_ON(PageLRU(kpte_page));
-	BUG_ON(PageCompound(kpte_page));
+	old_pte = *kpte;
+	if (!pte_val(old_pte)) {
+		if (!primary)
+			return 0;
+		printk(KERN_WARNING "CPA: called for zero pte. "
+		       "vaddr = %lx cpa->vaddr = %lx\n", address,
+		       cpa->vaddr);
+		WARN_ON(1);
+		return -EINVAL;
+	}
 
 	if (level == PG_LEVEL_4K) {
-		pte_t new_pte, old_pte = *kpte;
+		pte_t new_pte;
 		pgprot_t new_prot = pte_pgprot(old_pte);
-
-		if(!pte_val(old_pte)) {
-			printk(KERN_WARNING "CPA: called for zero pte. "
-			       "vaddr = %lx cpa->vaddr = %lx\n", address,
-			       cpa->vaddr);
-			WARN_ON(1);
-			return -EINVAL;
-		}
+		unsigned long pfn = pte_pfn(old_pte);
 
 		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
 		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
 
-		new_prot = static_protections(new_prot, address);
+		new_prot = static_protections(new_prot, address, pfn);
 
 		/*
 		 * We need to keep the pfn from the existing PTE,
 		 * after all we're only going to change it's attributes
 		 * not the memory it points to
 		 */
-		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
-
+		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+		cpa->pfn = pfn;
 		/*
 		 * Do we really change anything ?
 		 */
@@ -581,67 +601,59 @@ repeat:
 	return err;
 }
 
-/**
- * change_page_attr_addr - Change page table attributes in linear mapping
- * @address: Virtual address in linear mapping.
- * @prot: New page table attribute (PAGE_*)
- *
- * Change page attributes of a page in the direct mapping. This is a variant
- * of change_page_attr() that also works on memory holes that do not have
- * mem_map entry (pfn_valid() is false).
- *
- * See change_page_attr() documentation for more details.
- *
- * Modules and drivers should use the set_memory_* APIs instead.
- */
-static int change_page_attr_addr(struct cpa_data *cpa)
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+static int cpa_process_alias(struct cpa_data *cpa)
 {
-	int err;
-	unsigned long address = cpa->vaddr;
+	struct cpa_data alias_cpa;
+	int ret = 0;
 
-#ifdef CONFIG_X86_64
-	unsigned long phys_addr = __pa(address);
+	if (cpa->pfn > max_pfn_mapped)
+		return 0;
 
 	/*
-	 * If we are inside the high mapped kernel range, then we
-	 * fixup the low mapping first. __va() returns the virtual
-	 * address in the linear mapping:
+	 * No need to redo, when the primary call touched the direct
+	 * mapping already:
 	 */
-	if (within(address, HIGH_MAP_START, HIGH_MAP_END))
-		address = (unsigned long) __va(phys_addr);
-#endif
+	if (!within(cpa->vaddr, PAGE_OFFSET,
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+
+		alias_cpa = *cpa;
+		alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
 
-	err = __change_page_attr(address, cpa);
-	if (err)
-		return err;
+		ret = __change_page_attr_set_clr(&alias_cpa, 0);
+	}
 
 #ifdef CONFIG_X86_64
+	if (ret)
+		return ret;
+	/*
+	 * No need to redo, when the primary call touched the high
+	 * mapping already:
+	 */
+	if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+		return 0;
+
 	/*
 	 * If the physical address is inside the kernel map, we need
 	 * to touch the high mapped kernel as well:
 	 */
-	if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
-		/*
-		 * Calc the high mapping address. See __phys_addr()
-		 * for the non obvious details.
-		 *
-		 * Note that NX and other required permissions are
-		 * checked in static_protections().
-		 */
-		address = phys_addr + HIGH_MAP_START - phys_base;
+	if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
+		return 0;
 
-		/*
-		 * Our high aliases are imprecise, because we check
-		 * everything between 0 and KERNEL_TEXT_SIZE, so do
-		 * not propagate lookup failures back to users:
-		 */
-		__change_page_attr(address, cpa);
-	}
+	alias_cpa = *cpa;
+	alias_cpa.vaddr =
+		(cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+
+	/*
+	 * The high mapping range is imprecise, so ignore the return value.
+	 */
+	__change_page_attr_set_clr(&alias_cpa, 0);
 #endif
 
-	return err;
+	return ret;
 }
 
-static int __change_page_attr_set_clr(struct cpa_data *cpa)
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 {
 	int ret, numpages = cpa->numpages;
 
@@ -651,10 +663,17 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa)
 		 * preservation check.
 		 */
 		cpa->numpages = numpages;
-		ret = change_page_attr_addr(cpa);
+
+		ret = __change_page_attr(cpa, checkalias);
 		if (ret)
 			return ret;
 
+		if (checkalias) {
+			ret = cpa_process_alias(cpa);
+			if (ret)
+				return ret;
+		}
+
 		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
@@ -677,7 +696,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 				    pgprot_t mask_set, pgprot_t mask_clr)
 {
 	struct cpa_data cpa;
-	int ret, cache;
+	int ret, cache, checkalias;
 
 	/*
 	 * Check, if we are requested to change a not supported
@@ -688,13 +707,25 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
 		return 0;
 
+	/* Ensure we are PAGE_SIZE aligned */
+	if (addr & ~PAGE_MASK) {
+		addr &= PAGE_MASK;
+		/*
+		 * People should not be passing in unaligned addresses:
+		 */
+		WARN_ON_ONCE(1);
+	}
+
 	cpa.vaddr = addr;
 	cpa.numpages = numpages;
 	cpa.mask_set = mask_set;
 	cpa.mask_clr = mask_clr;
 	cpa.flushtlb = 0;
 
-	ret = __change_page_attr_set_clr(&cpa);
+	/* No alias checking for _NX bit modifications */
+	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+
+	ret = __change_page_attr_set_clr(&cpa, checkalias);
 
 	/*
 	 * Check whether we really changed something:
@@ -720,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 		cpa_flush_all(cache);
 
 out:
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
+
 	return ret;
 }
 
@@ -832,7 +864,7 @@ static int __set_pages_p(struct page *page, int numpages)
 				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
 				.mask_clr = __pgprot(0)};
 
-	return __change_page_attr_set_clr(&cpa);
+	return __change_page_attr_set_clr(&cpa, 1);
 }
 
 static int __set_pages_np(struct page *page, int numpages)
@@ -842,7 +874,7 @@ static int __set_pages_np(struct page *page, int numpages)
 				.mask_set = __pgprot(0),
 				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
 
-	return __change_page_attr_set_clr(&cpa);
+	return __change_page_attr_set_clr(&cpa, 1);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -861,8 +893,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 		return;
 
 	/*
-	 * The return value is ignored - the calls cannot fail,
-	 * large pages are disabled at boot time:
+	 * The return value is ignored as the calls cannot fail.
+	 * Large pages are kept enabled at boot time, and are
+	 * split up quickly with DEBUG_PAGEALLOC. If a splitup
+	 * fails here (due to temporary memory shortage) no damage
+	 * is done because we just keep the largepage intact up
+	 * to the next attempt when it will likely be split up:
 	 */
 	if (enable)
 		__set_pages_p(page, numpages);
@@ -879,9 +915,26 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	 * Try to refill the page pool here. We can do this only after
 	 * the tlb flush.
 	 */
-	cpa_fill_pool();
+	cpa_fill_pool(NULL);
 }
-#endif
+
+#ifdef CONFIG_HIBERNATION
+
+bool kernel_page_present(struct page *page)
+{
+	unsigned int level;
+	pte_t *pte;
+
+	if (PageHighMem(page))
+		return false;
+
+	pte = lookup_address((unsigned long)page_address(page), &level);
+	return (pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
 
 /*
  * The testcases use internal knowledge of the implementation that shouldn't
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index ecd91ea8a8a..845001c617c 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -166,7 +166,8 @@ static inline int save_add_info(void) {return 0;}
  * Both SPARSE and RESERVE need nodes_add information.
  * This code supports one contiguous hot add area per node.
  */
-static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+static int __init
+reserve_hotadd(int node, unsigned long start, unsigned long end)
 {
 	unsigned long s_pfn = start >> PAGE_SHIFT;
 	unsigned long e_pfn = end >> PAGE_SHIFT;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index b7c67a187b6..7b6e3bb9b28 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -541,7 +541,7 @@ void pcibios_disable_device (struct pci_dev *dev)
 		pcibios_disable_irq(dev);
 }
 
-struct pci_bus *pci_scan_bus_with_sysdata(int busno)
+struct pci_bus *__devinit pci_scan_bus_with_sysdata(int busno)
 {
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index ed07ce6c171..a8715861877 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -583,6 +583,10 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 		case PCI_DEVICE_ID_INTEL_ICH9_4:
 		case PCI_DEVICE_ID_INTEL_ICH9_5:
 		case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+		case PCI_DEVICE_ID_INTEL_ICH10_0:
+		case PCI_DEVICE_ID_INTEL_ICH10_1:
+		case PCI_DEVICE_ID_INTEL_ICH10_2:
+		case PCI_DEVICE_ID_INTEL_ICH10_3:
 			r->name = "PIIX/ICH";
 			r->get = pirq_piix_get;
 			r->set = pirq_piix_set;
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 1deb3244b99..000415947d9 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -20,6 +20,7 @@
 #include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 ENTRY(swsusp_arch_suspend)
 	movq	$saved_context, %rax
@@ -60,7 +61,7 @@ ENTRY(restore_image)
 	/* Flush TLB */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
-	andq	$~(1<<7), %rdx	# PGE
+	andq	$~(X86_CR4_PGE), %rdx
 	movq	%rdx, %cr4;  # turn off PGE
 	movq	%cr3, %rcx;  # flush TLB
 	movq	%rcx, %cr3;
@@ -112,7 +113,7 @@ ENTRY(restore_registers)
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
-	andq	$~(1<<7), %rdx;  # PGE
+	andq	$~(X86_CR4_PGE), %rdx
 	movq	%rdx, %cr4;  # turn off PGE
 	movq	%cr3, %rcx;  # flush TLB
 	movq	%rcx, %cr3
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index d28dda57470..b8bd0c4aa02 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -7,7 +7,7 @@ VDSO32-$(CONFIG_X86_32) := y
 VDSO32-$(CONFIG_COMPAT) := y
 
 vdso-install-$(VDSO64-y) += vdso.so
-vdso-install-$(VDSO32-y) += $(vdso32-y:=.so)
+vdso-install-$(VDSO32-y) += $(vdso32-images)
 
 # files to link into the vdso
@@ -48,7 +48,7 @@ obj-$(VDSO64-y) += vdso-syms.lds
 # Match symbols in the DSO that look like VDSO*; produce a file of constants.
 #
 sed-vdsosym := -e 's/^00*/0/' \
-	-e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
+	-e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p'
 quiet_cmd_vdsosym = VDSOSYM $@
       cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
 
@@ -63,6 +63,8 @@ vdso32.so-$(CONFIG_X86_32) += int80
 vdso32.so-$(CONFIG_COMPAT) += syscall
 vdso32.so-$(VDSO32-y) += sysenter
 
+vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
+
 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
 VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
 
@@ -71,21 +73,21 @@ VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
 
 targets += vdso32/vdso32.lds
-targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so)
+targets += $(vdso32-images) $(vdso32-images:=.dbg)
 targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o)
 
-extra-y += $(vdso32.so-y:%=vdso32-%.so)
+extra-y += $(vdso32-images)
 
-$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so)
+$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
 
 KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
-$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
-$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32
+$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
 
-$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
-				 $(obj)/vdso32/vdso32.lds \
-				 $(obj)/vdso32/note.o \
-				 $(obj)/vdso32/%.o
+$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
+				 $(obj)/vdso32/vdso32.lds \
+				 $(obj)/vdso32/note.o \
+				 $(obj)/vdso32/%.o
 	$(call if_changed,vdso)
 
 # Make vdso32-*-syms.lds from each image, and then make sure they match.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index de647bc6e74..49e5358f481 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -798,6 +798,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 	 * added to the table can be prepared properly for Xen.
 	 */
 	xen_write_cr3(__pa(base));
+
+	/* Unpin initial Xen pagetable */
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+			  PFN_DOWN(__pa(xen_start_info->pt_base)));
 }
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
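
Editor's note: the pageattr.c hunks above move the static_protections() checks from virtual addresses to physical page frame numbers, so every alias of a page (the direct mapping and the x86-64 high kernel mapping) is forced to the same protection bits, with cpa_process_alias() then fixing up the remaining alias explicitly. The following stand-alone user-space sketch (not kernel code; the within() helper, the 640k-1Mb window constants, and the example alias addresses are reproduced or invented here purely for illustration) shows why a pfn-based range check catches every virtual alias of the same physical page:

#include <stdio.h>

#define PAGE_SHIFT	12
#define BIOS_BEGIN	0x000a0000UL	/* 640k, illustrative value */
#define BIOS_END	0x00100000UL	/* 1Mb, illustrative value */

/* Same semantics as the kernel helper: start inclusive, end exclusive. */
static int within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

int main(void)
{
	/* Two hypothetical virtual aliases of one physical page at 0xc8000. */
	unsigned long phys = 0x000c8000UL;
	unsigned long direct_map_alias = 0xffff810000000000UL + phys;
	unsigned long high_map_alias   = 0xffffffff80000000UL + phys;
	unsigned long pfn = phys >> PAGE_SHIFT;

	/* An address-based check would need one test per alias ... */
	printf("direct alias %#lx, high alias %#lx\n",
	       direct_map_alias, high_map_alias);

	/* ... while a single pfn-based check covers both of them. */
	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
		printf("pfn %#lx lies in the BIOS window: force _PAGE_NX\n",
		       pfn);

	return 0;
}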