summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild16
-rw-r--r--arch/x86/Kconfig115
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile24
-rw-r--r--arch/x86/boot/.gitignore2
-rw-r--r--arch/x86/boot/Makefile30
-rw-r--r--arch/x86/boot/a20.c9
-rw-r--r--arch/x86/boot/apm.c76
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/boot.h48
-rw-r--r--arch/x86/boot/compressed/.gitignore3
-rw-r--r--arch/x86/boot/compressed/Makefile57
-rw-r--r--arch/x86/boot/compressed/head_32.S194
-rw-r--r--arch/x86/boot/compressed/head_64.S169
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c97
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S (renamed from arch/x86/boot/compressed/vmlinux_64.lds)29
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/edd.c71
-rw-r--r--arch/x86/boot/header.S30
-rw-r--r--arch/x86/boot/main.c39
-rw-r--r--arch/x86/boot/mca.c27
-rw-r--r--arch/x86/boot/memory.c79
-rw-r--r--arch/x86/boot/regs.c29
-rw-r--r--arch/x86/boot/setup.ld6
-rw-r--r--arch/x86/boot/tty.c52
-rw-r--r--arch/x86/boot/video-bios.c26
-rw-r--r--arch/x86/boot/video-vesa.c137
-rw-r--r--arch/x86/boot/video-vga.c95
-rw-r--r--arch/x86/boot/video.c42
-rw-r--r--arch/x86/boot/video.h14
-rw-r--r--arch/x86/configs/i386_defconfig148
-rw-r--r--arch/x86/configs/x86_64_defconfig151
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S5
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c271
-rw-r--r--arch/x86/crypto/fpu.c166
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/acpi.h1
-rw-r--r--arch/x86/include/asm/alternative.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h4
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h55
-rw-r--r--arch/x86/include/asm/apic.h33
-rw-r--r--arch/x86/include/asm/apicdef.h8
-rw-r--r--arch/x86/include/asm/atomic_32.h174
-rw-r--r--arch/x86/include/asm/atomic_64.h44
-rw-r--r--arch/x86/include/asm/bitsperlong.h13
-rw-r--r--arch/x86/include/asm/boot.h15
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h9
-rw-r--r--arch/x86/include/asm/desc.h26
-rw-r--r--arch/x86/include/asm/dma-mapping.h168
-rw-r--r--arch/x86/include/asm/ds.h82
-rw-r--r--arch/x86/include/asm/efi.h5
-rw-r--r--arch/x86/include/asm/entry_arch.h13
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h29
-rw-r--r--arch/x86/include/asm/i387.h43
-rw-r--r--arch/x86/include/asm/i8259.h4
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io_apic.h11
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/iommu.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h26
-rw-r--r--arch/x86/include/asm/irqflags.h8
-rw-r--r--arch/x86/include/asm/k8.h13
-rw-r--r--arch/x86/include/asm/kmap_types.h23
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h6
-rw-r--r--arch/x86/include/asm/lguest.h6
-rw-r--r--arch/x86/include/asm/lguest_hcall.h23
-rw-r--r--arch/x86/include/asm/mce.h143
-rw-r--r--arch/x86/include/asm/microcode.h25
-rw-r--r--arch/x86/include/asm/mman.h2
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/msr-index.h12
-rw-r--r--arch/x86/include/asm/msr.h30
-rw-r--r--arch/x86/include/asm/nmi.h3
-rw-r--r--arch/x86/include/asm/numa_64.h10
-rw-r--r--arch/x86/include/asm/page.h2
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h24
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/pci_x86.h3
-rw-r--r--arch/x86/include/asm/percpu.h10
-rw-r--r--arch/x86/include/asm/perf_counter.h98
-rw-r--r--arch/x86/include/asm/pgalloc.h25
-rw-r--r--arch/x86/include/asm/pgtable.h21
-rw-r--r--arch/x86/include/asm/pgtable_32.h8
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h11
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h10
-rw-r--r--arch/x86/include/asm/processor.h47
-rw-r--r--arch/x86/include/asm/proto.h11
-rw-r--r--arch/x86/include/asm/ptrace.h9
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/signal.h2
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/string_32.h8
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/syscalls.h45
-rw-r--r--arch/x86/include/asm/termios.h1
-rw-r--r--arch/x86/include/asm/therm_throt.h9
-rw-r--r--arch/x86/include/asm/thread_info.h10
-rw-r--r--arch/x86/include/asm/timer.h6
-rw-r--r--arch/x86/include/asm/timex.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h10
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/traps.h5
-rw-r--r--arch/x86/include/asm/types.h6
-rw-r--r--arch/x86/include/asm/uaccess.h6
-rw-r--r--arch/x86/include/asm/uaccess_64.h10
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h4
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h13
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/Makefile9
-rw-r--r--arch/x86/kernel/acpi/boot.c236
-rw-r--r--arch/x86/kernel/acpi/cstate.c16
-rw-r--r--arch/x86/kernel/acpi/processor.c13
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile3
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c520
-rw-r--r--arch/x86/kernel/amd_iommu_init.c296
-rw-r--r--arch/x86/kernel/apic/apic.c320
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c5
-rw-r--r--arch/x86/kernel/apic/io_apic.c970
-rw-r--r--arch/x86/kernel/apic/ipi.c3
-rw-r--r--arch/x86/kernel/apic/nmi.c4
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/probe_32.c12
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c12
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c64
-rw-r--r--arch/x86/kernel/apm_32.c16
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile16
-rw-r--r--arch/x86/kernel/cpu/amd.c23
-rw-r--r--arch/x86/kernel/cpu/common.c86
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c221
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c60
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c93
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c29
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile13
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c45
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c218
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2060
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c76
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c1187
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c (renamed from arch/x86/kernel/cpu/mcheck/mce_amd_64.c)203
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c (renamed from arch/x86/kernel/cpu/mcheck/mce_intel_64.c)84
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c60
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c112
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c51
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c29
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c200
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c20
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1964
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c21
-rw-r--r--arch/x86/kernel/cpuid.c6
-rw-r--r--arch/x86/kernel/crash.c6
-rw-r--r--arch/x86/kernel/ds.c921
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.c1
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c22
-rw-r--r--arch/x86/kernel/e820.c57
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/efi.c35
-rw-r--r--arch/x86/kernel/efi_64.c6
-rw-r--r--arch/x86/kernel/entry_32.S66
-rw-r--r--arch/x86/kernel/entry_64.S42
-rw-r--r--arch/x86/kernel/ftrace.c6
-rw-r--r--arch/x86/kernel/head_32.S14
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/hpet.c3
-rw-r--r--arch/x86/kernel/i8253.c1
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/irq.c47
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)155
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c8
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c330
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c)82
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/msr.c6
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-dma.c8
-rw-r--r--arch/x86/kernel/pci-gart_64.c57
-rw-r--r--arch/x86/kernel/pci-swiotlb.c5
-rw-r--r--arch/x86/kernel/process.c28
-rw-r--r--arch/x86/kernel/process_32.c20
-rw-r--r--arch/x86/kernel/process_64.c20
-rw-r--r--arch/x86/kernel/ptrace.c284
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c59
-rw-r--r--arch/x86/kernel/setup.c84
-rw-r--r--arch/x86/kernel/setup_percpu.c229
-rw-r--r--arch/x86/kernel/signal.c7
-rw-r--r--arch/x86/kernel/smp.c51
-rw-r--r--arch/x86/kernel/smpboot.c24
-rw-r--r--arch/x86/kernel/stacktrace.c9
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c25
-rw-r--r--arch/x86/kernel/traps.c37
-rw-r--r--arch/x86/kernel/tsc.c65
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c22
-rw-r--r--arch/x86/kernel/vmlinux.lds.S393
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kvm/Kconfig6
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/i8254.c112
-rw-r--r--arch/x86/kvm/i8254.h12
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c251
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c242
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/svm.c421
-rw-r--r--arch/x86/kvm/timer.c46
-rw-r--r--arch/x86/kvm/vmx.c744
-rw-r--r--arch/x86/kvm/x86.c454
-rw-r--r--arch/x86/kvm/x86.h14
-rw-r--r--arch/x86/kvm/x86_emulate.c141
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c664
-rw-r--r--arch/x86/lguest/i386_head.S130
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/atomic64_32.c230
-rw-r--r--arch/x86/lib/clear_page_64.S5
-rw-r--r--arch/x86/lib/copy_user_64.S1
-rw-r--r--arch/x86/lib/delay.c3
-rw-r--r--arch/x86/lib/msr-on-cpu.c97
-rw-r--r--arch/x86/lib/msr.c177
-rw-r--r--arch/x86/lib/usercopy_32.c2
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c7
-rw-r--r--arch/x86/mm/fault.c103
-rw-r--r--arch/x86/mm/gup.c67
-rw-r--r--arch/x86/mm/highmem_32.c3
-rw-r--r--arch/x86/mm/init.c98
-rw-r--r--arch/x86/mm/init_32.c73
-rw-r--r--arch/x86/mm/init_64.c64
-rw-r--r--arch/x86/mm/iomap_32.c1
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c640
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/kmmio.c104
-rw-r--r--arch/x86/mm/memtest.c17
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa_64.c33
-rw-r--r--arch/x86/mm/pageattr.c120
-rw-r--r--arch/x86/mm/pat.c3
-rw-r--r--arch/x86/mm/pgtable.c19
-rw-r--r--arch/x86/mm/srat_64.c104
-rw-r--r--arch/x86/mm/tlb.c21
-rw-r--r--arch/x86/oprofile/nmi_int.c34
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/pci/acpi.c70
-rw-r--r--arch/x86/pci/amd_bus.c8
-rw-r--r--arch/x86/pci/i386.c24
-rw-r--r--arch/x86/pci/irq.c84
-rw-r--r--arch/x86/pci/mmconfig-shared.c65
-rw-r--r--arch/x86/power/Makefile4
-rw-r--r--arch/x86/power/cpu.c (renamed from arch/x86/power/cpu_64.c)165
-rw-r--r--arch/x86/power/cpu_32.c148
-rw-r--r--arch/x86/vdso/Makefile1
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c8
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c87
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/xen-ops.h1
336 files changed, 16929 insertions, 9231 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 00000000000..ad8ec356fb3
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a6efe0a2e9a..13ffa5df37d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,15 +24,18 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
+ select HAVE_PERF_COUNTERS if (!M386 && !M486)
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
+ select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_GRAPH_FP_TEST
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
select HAVE_FTRACE_SYSCALLS
@@ -46,6 +49,12 @@ config X86
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_ARCH_KMEMCHECK
+
+config OUTPUT_FORMAT
+ string
+ default "elf32-i386" if X86_32
+ default "elf64-x86-64" if X86_64
config ARCH_DEFCONFIG
string
@@ -274,15 +283,9 @@ config SPARSE_IRQ
If you don't know what to do here, say N.
-config NUMA_MIGRATE_IRQ_DESC
- bool "Move irq desc when changing irq smp_affinity"
+config NUMA_IRQ_DESC
+ def_bool y
depends on SPARSE_IRQ && NUMA
- depends on BROKEN
- default n
- ---help---
- This enables moving irq_desc to cpu/node that irq will use handled.
-
- If you don't know what to do here, say N.
config X86_MPPARSE
bool "Enable MPS table" if ACPI
@@ -355,7 +358,7 @@ config X86_UV
depends on X86_64
depends on X86_EXTENDED_PLATFORM
depends on NUMA
- select X86_X2APIC
+ depends on X86_X2APIC
---help---
This option is needed in order to support SGI Ultraviolet systems.
If you don't have one of these, you should say N here.
@@ -789,10 +792,26 @@ config X86_MCE
to disable it. MCE support simply ignores non-MCE processors like
the 386 and 486, so nearly everyone can say Y here.
+config X86_OLD_MCE
+ depends on X86_32 && X86_MCE
+ bool "Use legacy machine check code (will go away)"
+ default n
+ select X86_ANCIENT_MCE
+ ---help---
+ Use the old i386 machine check code. This is merely intended for
+ testing in a transition period. Try this if you run into any machine
+ check related software problems, but report the problem to
+ linux-kernel. When in doubt say no.
+
+config X86_NEW_MCE
+ depends on X86_MCE
+ bool
+ default y if (!X86_OLD_MCE && X86_32) || X86_64
+
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
- depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+ depends on X86_NEW_MCE && X86_LOCAL_APIC
---help---
Additional support for intel specific MCE features such as
the thermal monitor.
@@ -800,19 +819,36 @@ config X86_MCE_INTEL
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
- depends on X86_64 && X86_MCE && X86_LOCAL_APIC
+ depends on X86_NEW_MCE && X86_LOCAL_APIC
---help---
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
+config X86_ANCIENT_MCE
+ def_bool n
+ depends on X86_32
+ prompt "Support for old Pentium 5 / WinChip machine checks"
+ ---help---
+ Include support for machine check handling on old Pentium 5 or WinChip
+ systems. These typically need to be enabled explicitely on the command
+ line.
+
config X86_MCE_THRESHOLD
depends on X86_MCE_AMD || X86_MCE_INTEL
bool
default y
+config X86_MCE_INJECT
+ depends on X86_NEW_MCE
+ tristate "Machine check injector support"
+ ---help---
+ Provide support for injecting machine checks for testing purposes.
+ If you don't know what a machine check is and you don't do kernel
+ QA it is safe to say n.
+
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
- depends on X86_32 && X86_MCE
+ depends on X86_OLD_MCE
---help---
Enabling this feature starts a timer that triggers every 5 seconds which
will look at the machine check registers to see if anything happened.
@@ -825,11 +861,15 @@ config X86_MCE_NONFATAL
config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt."
- depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
+ depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
---help---
Enabling this feature will cause a message to be printed when the P4
enters thermal throttling.
+config X86_THERMAL_VECTOR
+ def_bool y
+ depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
+
config VM86
bool "Enable VM86 support" if EMBEDDED
default y
@@ -1466,9 +1506,7 @@ config KEXEC_JUMP
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
- default "0x1000000" if X86_NUMAQ
- default "0x200000" if X86_64
- default "0x100000"
+ default "0x1000000"
---help---
This gives the physical address where the kernel is loaded.
@@ -1487,15 +1525,15 @@ config PHYSICAL_START
to be specifically compiled to run from a specific memory area
(normally a reserved region) and this option comes handy.
- So if you are using bzImage for capturing the crash dump, leave
- the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
- Otherwise if you plan to use vmlinux for capturing the crash dump
- change this value to start of the reserved region (Typically 16MB
- 0x1000000). In other words, it can be set based on the "X" value as
- specified in the "crashkernel=YM@XM" command line boot parameter
- passed to the panic-ed kernel. Typically this parameter is set as
- crashkernel=64M@16M. Please take a look at
- Documentation/kdump/kdump.txt for more details about crash dumps.
+ So if you are using bzImage for capturing the crash dump,
+ leave the value here unchanged to 0x1000000 and set
+ CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
+ for capturing the crash dump change this value to start of
+ the reserved region. In other words, it can be set based on
+ the "X" value as specified in the "crashkernel=YM@XM"
+ command line boot parameter passed to the panic-ed
+ kernel. Please take a look at Documentation/kdump/kdump.txt
+ for more details about crash dumps.
Usage of bzImage for capturing the crash dump is recommended as
one does not have to build two kernels. Same kernel can be used
@@ -1508,8 +1546,8 @@ config PHYSICAL_START
Don't change this unless you know what you are doing.
config RELOCATABLE
- bool "Build a relocatable kernel (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Build a relocatable kernel"
+ default y
---help---
This builds a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
@@ -1524,12 +1562,16 @@ config RELOCATABLE
it has been loaded at and the compile time physical address
(CONFIG_PHYSICAL_START) is ignored.
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+ def_bool y
+ depends on X86_32 && RELOCATABLE
+
config PHYSICAL_ALIGN
hex
prompt "Alignment value to which kernel should be aligned" if X86_32
- default "0x100000" if X86_32
- default "0x200000" if X86_64
- range 0x2000 0x400000
+ default "0x1000000"
+ range 0x2000 0x1000000
---help---
This value puts the alignment restrictions on physical address
where kernel is loaded and run from. Kernel is compiled for an
@@ -1871,25 +1913,26 @@ config DMAR_DEFAULT_ON
recommended you say N here while the DMAR code remains
experimental.
-config DMAR_GFX_WA
- def_bool y
- prompt "Support for Graphics workaround"
+config DMAR_BROKEN_GFX_WA
+ def_bool n
+ prompt "Workaround broken graphics drivers (going away soon)"
depends on DMAR
---help---
Current Graphics drivers tend to use physical address
for DMA and avoid using DMA APIs. Setting this config
option permits the IOMMU driver to set a unity map for
all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA.
+ to use physical addresses for DMA, at least until this
+ option is removed in the 2.6.32 kernel.
config DMAR_FLOPPY_WA
def_bool y
depends on DMAR
---help---
- Floppy disk drivers are know to bypass DMA API calls
+ Floppy disk drivers are known to bypass DMA API calls
thereby failing to work when IOMMU is enabled. This
workaround will setup a 1:1 mapping for the first
- 16M to make floppy (an ISA device) work.
+ 16MiB to make floppy (an ISA device) work.
config INTR_REMAP
bool "Support for Interrupt Remapping (EXPERIMENTAL)"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e73317..d105f29bb6b 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,14 +159,30 @@ config IOMMU_DEBUG
options. See Documentation/x86_64/boot-options.txt for more
details.
+config IOMMU_STRESS
+ bool "Enable IOMMU stress-test mode"
+ ---help---
+ This option disables various optimizations in IOMMU related
+ code to do real stress testing of the IOMMU code. This option
+ will cause a performance drop and should only be enabled for
+ testing.
+
config IOMMU_LEAK
bool "IOMMU leak tracing"
- depends on DEBUG_KERNEL
- depends on IOMMU_DEBUG
+ depends on IOMMU_DEBUG && DMA_API_DEBUG
---help---
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
+config X86_DS_SELFTEST
+ bool "DS selftest"
+ default y
+ depends on DEBUG_KERNEL
+ depends on X86_DS
+ ---help---
+ Perform Debug Store selftests at boot time.
+ If in doubt, say "N".
+
config HAVE_MMIOTRACE_SUPPORT
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8c86b72afdc..1b68659c41b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
-core-$(CONFIG_KVM) += arch/x86/kvm/
-
# BITS is used as extension for files which are available in a 32 bit
# and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o
@@ -83,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
endif
endif
+# Don't unroll struct assignments with kmemcheck enabled
+ifeq ($(CONFIG_KMEMCHECK),y)
+ KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
+endif
+
# Stackpointer is addressed different for 32 bit and 64 bit x86
sp-$(CONFIG_X86_32) := esp
sp-$(CONFIG_X86_64) := rsp
@@ -118,21 +121,8 @@ head-y += arch/x86/kernel/init_task.o
libs-y += arch/x86/lib/
-# Sub architecture files that needs linking first
-core-y += $(fcore-y)
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
-
-core-y += arch/x86/kernel/
-core-y += arch/x86/mm/
-
-core-y += arch/x86/crypto/
-core-y += arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bd..851fe936d24 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
cpustr.h
mkcpustr
offsets.h
+voffset.h
+zoffset.h
setup
setup.bin
setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505..ec749c2bfdd 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
-setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y += printf.o string.o tty.o video.o video-mode.o version.o
+setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
+setup-y += version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
# The link order of the video-*.o modules can matter. In particular,
@@ -69,6 +70,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
$(call cc-option, -mpreferred-stack-boundary=2)
KBUILD_CFLAGS += $(call cc-option, -m32)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
@@ -86,19 +88,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-offsets := -e 's/^00*/0/' \
- -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
-quiet_cmd_offsets = OFFSETS $@
- cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+quiet_cmd_voffset = VOFFSET $@
+ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
-$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
- $(call if_changed,offsets)
+targets += voffset.h
+$(obj)/voffset.h: vmlinux FORCE
+ $(call if_changed,voffset)
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,zoffset)
-targets += offsets.h
AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/offsets.h
+$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
LDFLAGS_setup.elf := -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c244..64a31a6d751 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- * Copyright 2009 Intel Corporation
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
static void enable_a20_bios(void)
{
- asm volatile("pushfl; int $0x15; popfl"
- : : "a" ((u16)0x2401));
+ struct biosregs ireg;
+
+ initregs(&ireg);
+ ireg.ax = 0x2401;
+ intcall(0x15, &ireg, NULL);
}
static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f..ee274834ea8 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* Original APM BIOS checking by Stephen Rothwell, May 1994
* (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
int query_apm_bios(void)
{
- u16 ax, bx, cx, dx, di;
- u32 ebx, esi;
- u8 err;
+ struct biosregs ireg, oreg;
/* APM BIOS installation check */
- ax = 0x5300;
- bx = cx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
- : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
- : : "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x53;
+ intcall(0x15, &ireg, &oreg);
- if (err)
+ if (oreg.flags & X86_EFLAGS_CF)
return -1; /* No APM BIOS */
- if (bx != 0x504d) /* "PM" signature */
+ if (oreg.bx != 0x504d) /* "PM" signature */
return -1;
- if (!(cx & 0x02)) /* 32 bits supported? */
+ if (!(oreg.cx & 0x02)) /* 32 bits supported? */
return -1;
/* Disconnect first, just in case */
- ax = 0x5304;
- bx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
-
- /* Paranoia */
- ebx = esi = 0;
- cx = dx = di = 0;
+ ireg.al = 0x04;
+ intcall(0x15, &ireg, NULL);
/* 32-bit connect */
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
- : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
- "+S" (esi), "+D" (di), "=m" (err)
- : "a" (0x5303));
-
- boot_params.apm_bios_info.cseg = ax;
- boot_params.apm_bios_info.offset = ebx;
- boot_params.apm_bios_info.cseg_16 = cx;
- boot_params.apm_bios_info.dseg = dx;
- boot_params.apm_bios_info.cseg_len = (u16)esi;
- boot_params.apm_bios_info.cseg_16_len = esi >> 16;
- boot_params.apm_bios_info.dseg_len = di;
-
- if (err)
+ ireg.al = 0x03;
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.apm_bios_info.cseg = oreg.ax;
+ boot_params.apm_bios_info.offset = oreg.ebx;
+ boot_params.apm_bios_info.cseg_16 = oreg.cx;
+ boot_params.apm_bios_info.dseg = oreg.dx;
+ boot_params.apm_bios_info.cseg_len = oreg.si;
+ boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+ boot_params.apm_bios_info.dseg_len = oreg.di;
+
+ if (oreg.flags & X86_EFLAGS_CF)
return -1;
/* Redo the installation check as the 32-bit connect;
some BIOSes return different flags this way... */
- ax = 0x5300;
- bx = cx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
- : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
- : : "esi", "edi");
+ ireg.al = 0x00;
+ intcall(0x15, &ireg, &oreg);
- if (err || bx != 0x504d) {
+ if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
/* Failure with 32-bit connect, try to disconect and ignore */
- ax = 0x5304;
- bx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
+ ireg.al = 0x04;
+ intcall(0x15, &ireg, NULL);
return -1;
}
- boot_params.apm_bios_info.version = ax;
- boot_params.apm_bios_info.flags = cx;
+ boot_params.apm_bios_info.version = oreg.ax;
+ boot_params.apm_bios_info.flags = oreg.cx;
return 0;
}
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 00000000000..1dfbf64e52a
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
+ * touching registers they shouldn't be.
+ */
+
+ .code16gcc
+ .text
+ .globl intcall
+ .type intcall, @function
+intcall:
+ /* Self-modify the INT instruction. Ugly, but works. */
+ cmpb %al, 3f
+ je 1f
+ movb %al, 3f
+ jmp 1f /* Synchronize pipeline */
+1:
+ /* Save state */
+ pushfl
+ pushw %fs
+ pushw %gs
+ pushal
+
+ /* Copy input state to stack frame */
+ subw $44, %sp
+ movw %dx, %si
+ movw %sp, %di
+ movw $11, %cx
+ rep; movsd
+
+ /* Pop full state from the stack */
+ popal
+ popw %gs
+ popw %fs
+ popw %es
+ popw %ds
+ popfl
+
+ /* Actual INT */
+ .byte 0xcd /* INT opcode */
+3: .byte 0
+
+ /* Push full state to the stack */
+ pushfl
+ pushw %ds
+ pushw %es
+ pushw %fs
+ pushw %gs
+ pushal
+
+ /* Re-establish C environment invariants */
+ cld
+ movzwl %sp, %esp
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
+ /* Copy output state from stack frame */
+ movw 68(%esp), %di /* Original %cx == 3rd argument */
+ andw %di, %di
+ jz 4f
+ movw %sp, %si
+ movw $11, %cx
+ rep; movsd
+4: addw $44, %sp
+
+ /* Restore state and return */
+ popal
+ popw %gs
+ popw %fs
+ popfl
+ retl
+ .size intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e..98239d2658f 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
#include <asm/setup.h>
#include "bitops.h"
#include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
/* Useful macros */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
/* apm.c */
int query_apm_bios(void);
+/* bioscall.c */
+struct biosregs {
+ union {
+ struct {
+ u32 edi;
+ u32 esi;
+ u32 ebp;
+ u32 _esp;
+ u32 ebx;
+ u32 edx;
+ u32 ecx;
+ u32 eax;
+ u32 _fsgs;
+ u32 _dses;
+ u32 eflags;
+ };
+ struct {
+ u16 di, hdi;
+ u16 si, hsi;
+ u16 bp, hbp;
+ u16 _sp, _hsp;
+ u16 bx, hbx;
+ u16 dx, hdx;
+ u16 cx, hcx;
+ u16 ax, hax;
+ u16 gs, fs;
+ u16 es, ds;
+ u16 flags, hflags;
+ };
+ struct {
+ u8 dil, dih, edi2, edi3;
+ u8 sil, sih, esi2, esi3;
+ u8 bpl, bph, ebp2, ebp3;
+ u8 _spl, _sph, _esp2, _esp3;
+ u8 bl, bh, ebx2, ebx3;
+ u8 dl, dh, edx2, edx3;
+ u8 cl, ch, ecx2, ecx3;
+ u8 al, ah, eax2, eax3;
+ };
+ };
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
/* cmdline.c */
int cmdline_find_option(const char *option, char *buffer, int bufsize);
int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
int vsprintf(char *buf, const char *fmt, va_list args);
int printf(const char *fmt, ...);
+/* regs.c */
+void initregs(struct biosregs *regs);
+
/* string.c */
int strcmp(const char *str1, const char *str2);
size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d0..4a46fab7162 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
relocs
vmlinux.bin.all
vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f857..f8ed0658404 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -15,11 +15,14 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
LDFLAGS := -m elf_$(UTS_MACHINE)
LDFLAGS_vmlinux := -T
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+hostprogs-y := mkpiggy
+
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
$(call if_changed,ld)
@:
@@ -29,7 +32,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_32) += relocs
+hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +40,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
$(call if_changed,relocs)
vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD $@
- cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,relocbin)
-
-ifeq ($(CONFIG_X86_32),y)
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
- $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
- $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
- $(call if_changed,lzma)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzma)
-endif
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
-else
+suffix-$(CONFIG_KERNEL_GZIP) := gz
+suffix-$(CONFIG_KERNEL_BZIP2) := bz2
+suffix-$(CONFIG_KERNEL_LZMA) := lzma
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
- $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
- $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
- $(call if_changed,lzma)
-
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-endif
+quiet_cmd_mkpiggy = MKPIGGY $@
+ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
-suffix_$(CONFIG_KERNEL_GZIP) = gz
-suffix_$(CONFIG_KERNEL_BZIP2) = bz2
-suffix_$(CONFIG_KERNEL_LZMA) = lzma
-
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
- $(call if_changed,ld)
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+ $(call if_changed,mkpiggy)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e..75e4f001e70 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
* the page directory. [According to comments etc elsewhere on a compressed
* kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
*
- * Page 0 is deliberately kept safe, since System Management Mode code in
+ * Page 0 is deliberately kept safe, since System Management Mode code in
* laptops may need to access the BIOS data stored there. This is also
- * useful for future device drivers that either access the BIOS via VM86
+ * useful for future device drivers that either access the BIOS via VM86
* mode.
*/
/*
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-.text
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
@@ -29,161 +29,151 @@
#include <asm/boot.h>
#include <asm/asm-offsets.h>
-.section ".text.head","ax",@progbits
+ .section ".text.head","ax",@progbits
ENTRY(startup_32)
cld
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 1f
+ /*
+ * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+ * us to not reload segments
+ */
+ testb $(1<<6), BP_loadflags(%esi)
+ jnz 1f
cli
- movl $(__BOOT_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- movl %eax,%fs
- movl %eax,%gs
- movl %eax,%ss
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
1:
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
* data at 0x1e4 (defined as a scratch field) are used as the stack
* for this calculation. Only 4 bytes are needed.
*/
- leal (0x1e4+4)(%esi), %esp
- call 1f
-1: popl %ebp
- subl $1b, %ebp
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: popl %ebp
+ subl $1b, %ebp
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
* contains the address where we should move the kernel image temporarily
* for safe in-place decompression.
*/
#ifdef CONFIG_RELOCATABLE
- movl %ebp, %ebx
- addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
- andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
+ movl %ebp, %ebx
+ movl BP_kernel_alignment(%esi), %eax
+ decl %eax
+ addl %eax, %ebx
+ notl %eax
+ andl %eax, %ebx
#else
- movl $LOAD_PHYSICAL_ADDR, %ebx
+ movl $LOAD_PHYSICAL_ADDR, %ebx
#endif
- /* Replace the compressed data size with the uncompressed size */
- subl input_len(%ebp), %ebx
- movl output_len(%ebp), %eax
- addl %eax, %ebx
- /* Add 8 bytes for every 32K input block */
- shrl $12, %eax
- addl %eax, %ebx
- /* Add 32K + 18 bytes of extra slack */
- addl $(32768 + 18), %ebx
- /* Align on a 4K boundary */
- addl $4095, %ebx
- andl $~4095, %ebx
-
-/* Copy the compressed kernel to the end of our buffer
+ /* Target address to relocate to for decompression */
+ addl $z_extract_offset, %ebx
+
+ /* Set up the stack */
+ leal boot_stack_end(%ebx), %esp
+
+ /* Zero EFLAGS */
+ pushl $0
+ popfl
+
+/*
+ * Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- pushl %esi
- leal _end(%ebp), %esi
- leal _end(%ebx), %edi
- movl $(_end - startup_32), %ecx
+ pushl %esi
+ leal (_bss-4)(%ebp), %esi
+ leal (_bss-4)(%ebx), %edi
+ movl $(_bss - startup_32), %ecx
+ shrl $2, %ecx
std
- rep
- movsb
+ rep movsl
cld
- popl %esi
-
-/* Compute the kernel start address.
- */
-#ifdef CONFIG_RELOCATABLE
- addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
- andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
-#else
- movl $LOAD_PHYSICAL_ADDR, %ebp
-#endif
+ popl %esi
/*
* Jump to the relocated address.
*/
- leal relocated(%ebx), %eax
- jmp *%eax
+ leal relocated(%ebx), %eax
+ jmp *%eax
ENDPROC(startup_32)
-.section ".text"
+ .text
relocated:
/*
- * Clear BSS
- */
- xorl %eax,%eax
- leal _edata(%ebx),%edi
- leal _end(%ebx), %ecx
- subl %edi,%ecx
- cld
- rep
- stosb
-
-/*
- * Setup the stack for the decompressor
+ * Clear BSS (stack is currently empty)
*/
- leal boot_stack_end(%ebx), %esp
+ xorl %eax, %eax
+ leal _bss(%ebx), %edi
+ leal _ebss(%ebx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ rep stosl
/*
* Do the decompression, and jump to the new kernel..
*/
- movl output_len(%ebx), %eax
- pushl %eax
- # push arguments for decompress_kernel:
- pushl %ebp # output address
- movl input_len(%ebx), %eax
- pushl %eax # input_len
- leal input_data(%ebx), %eax
- pushl %eax # input_data
- leal boot_heap(%ebx), %eax
- pushl %eax # heap area
- pushl %esi # real mode pointer
- call decompress_kernel
- addl $20, %esp
- popl %ecx
+ leal z_extract_offset_negative(%ebx), %ebp
+ /* push arguments for decompress_kernel: */
+ pushl %ebp /* output address */
+ pushl $z_input_len /* input_len */
+ leal input_data(%ebx), %eax
+ pushl %eax /* input_data */
+ leal boot_heap(%ebx), %eax
+ pushl %eax /* heap area */
+ pushl %esi /* real mode pointer */
+ call decompress_kernel
+ addl $20, %esp
#if CONFIG_RELOCATABLE
-/* Find the address of the relocations.
+/*
+ * Find the address of the relocations.
*/
- movl %ebp, %edi
- addl %ecx, %edi
+ leal z_output_len(%ebp), %edi
-/* Calculate the delta between where vmlinux was compiled to run
+/*
+ * Calculate the delta between where vmlinux was compiled to run
* and where it was actually loaded.
*/
- movl %ebp, %ebx
- subl $LOAD_PHYSICAL_ADDR, %ebx
- jz 2f /* Nothing to be done if loaded at compiled addr. */
+ movl %ebp, %ebx
+ subl $LOAD_PHYSICAL_ADDR, %ebx
+ jz 2f /* Nothing to be done if loaded at compiled addr. */
/*
* Process relocations.
*/
-1: subl $4, %edi
- movl 0(%edi), %ecx
- testl %ecx, %ecx
- jz 2f
- addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
- jmp 1b
+1: subl $4, %edi
+ movl (%edi), %ecx
+ testl %ecx, %ecx
+ jz 2f
+ addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
+ jmp 1b
2:
#endif
/*
* Jump to the decompressed kernel.
*/
- xorl %ebx,%ebx
- jmp *%ebp
+ xorl %ebx, %ebx
+ jmp *%ebp
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+ .bss
+ .balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a8294800..f62c284db9e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
/*
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-.code32
-.text
+ .code32
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
@@ -33,12 +33,14 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
-.section ".text.head"
+ .section ".text.head"
.code32
ENTRY(startup_32)
cld
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments */
+ /*
+ * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+ * us to not reload segments
+ */
testb $(1<<6), BP_loadflags(%esi)
jnz 1f
@@ -49,14 +51,15 @@ ENTRY(startup_32)
movl %eax, %ss
1:
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
* data at 0x1e4 (defined as a scratch field) are used as the stack
* for this calculation. Only 4 bytes are needed.
*/
- leal (0x1e4+4)(%esi), %esp
+ leal (BP_scratch+4)(%esi), %esp
call 1f
1: popl %ebp
subl $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
testl %eax, %eax
jnz no_longmode
-/* Compute the delta between where we were compiled to run at
+/*
+ * Compute the delta between where we were compiled to run at
* and where the code will actually run at.
- */
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
* contains the address where we should move the kernel image temporarily
* for safe in-place decompression.
*/
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
- addl $(PMD_PAGE_SIZE -1), %ebx
- andl $PMD_PAGE_MASK, %ebx
+ movl BP_kernel_alignment(%esi), %eax
+ decl %eax
+ addl %eax, %ebx
+ notl %eax
+ andl %eax, %ebx
#else
- movl $CONFIG_PHYSICAL_START, %ebx
+ movl $LOAD_PHYSICAL_ADDR, %ebx
#endif
- /* Replace the compressed data size with the uncompressed size */
- subl input_len(%ebp), %ebx
- movl output_len(%ebp), %eax
- addl %eax, %ebx
- /* Add 8 bytes for every 32K input block */
- shrl $12, %eax
- addl %eax, %ebx
- /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
- addl $(32768 + 18 + 4095), %ebx
- andl $~4095, %ebx
+ /* Target address to relocate to for decompression */
+ addl $z_extract_offset, %ebx
/*
* Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
/*
* Build early 4G boot pagetable
*/
- /* Initialize Page tables to 0*/
+ /* Initialize Page tables to 0 */
leal pgtable(%ebx), %edi
xorl %eax, %eax
movl $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
btsl $_EFER_LME, %eax
wrmsr
- /* Setup for the jump to 64bit mode
+ /*
+ * Setup for the jump to 64bit mode
*
* When the jump is performend we will be in long mode but
* in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
#include "../../kernel/verify_cpu_64.S"
- /* Be careful here startup_64 needs to be at a predictable
+ /*
+ * Be careful here startup_64 needs to be at a predictable
* address so I can export it in an ELF header. Bootloaders
* should look at the ELF header to find this address, as
* it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
.code64
.org 0x200
ENTRY(startup_64)
- /* We come here either from startup_32 or directly from a
+ /*
+ * We come here either from startup_32 or directly from a
* 64bit bootloader. If we come here from a bootloader we depend on
* an identity mapped page table being provied that maps our
* entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
movl $0x20, %eax
ltr %ax
- /* Compute the decompressed kernel start address. It is where
+ /*
+ * Compute the decompressed kernel start address. It is where
* we were loaded at aligned to a 2M boundary. %rbp contains the
* decompressed kernel start address.
*
* If it is a relocatable kernel then decompress and run the kernel
* from load address aligned to 2MB addr, otherwise decompress and
- * run the kernel from CONFIG_PHYSICAL_START
+ * run the kernel from LOAD_PHYSICAL_ADDR
+ *
+ * We cannot rely on the calculation done in 32-bit mode, since we
+ * may have been invoked via the 64-bit entry point.
*/
/* Start with the delta to where the kernel will run at. */
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp
- addq $(PMD_PAGE_SIZE - 1), %rbp
- andq $PMD_PAGE_MASK, %rbp
- movq %rbp, %rbx
+ movl BP_kernel_alignment(%rsi), %eax
+ decl %eax
+ addq %rax, %rbp
+ notq %rax
+ andq %rax, %rbp
#else
- movq $CONFIG_PHYSICAL_START, %rbp
- movq %rbp, %rbx
+ movq $LOAD_PHYSICAL_ADDR, %rbp
#endif
- /* Replace the compressed data size with the uncompressed size */
- movl input_len(%rip), %eax
- subq %rax, %rbx
- movl output_len(%rip), %eax
- addq %rax, %rbx
- /* Add 8 bytes for every 32K input block */
- shrq $12, %rax
- addq %rax, %rbx
- /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
- addq $(32768 + 18 + 4095), %rbx
- andq $~4095, %rbx
-
-/* Copy the compressed kernel to the end of our buffer
+ /* Target address to relocate to for decompression */
+ leaq z_extract_offset(%rbp), %rbx
+
+ /* Set up the stack */
+ leaq boot_stack_end(%rbx), %rsp
+
+ /* Zero EFLAGS */
+ pushq $0
+ popfq
+
+/*
+ * Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- leaq _end_before_pgt(%rip), %r8
- leaq _end_before_pgt(%rbx), %r9
- movq $_end_before_pgt /* - $startup_32 */, %rcx
-1: subq $8, %r8
- subq $8, %r9
- movq 0(%r8), %rax
- movq %rax, 0(%r9)
- subq $8, %rcx
- jnz 1b
+ pushq %rsi
+ leaq (_bss-8)(%rip), %rsi
+ leaq (_bss-8)(%rbx), %rdi
+ movq $_bss /* - $startup_32 */, %rcx
+ shrq $3, %rcx
+ std
+ rep movsq
+ cld
+ popq %rsi
/*
* Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
leaq relocated(%rbx), %rax
jmp *%rax
-.section ".text"
+ .text
relocated:
/*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
*/
- xorq %rax, %rax
- leaq _edata(%rbx), %rdi
- leaq _end_before_pgt(%rbx), %rcx
+ xorl %eax, %eax
+ leaq _bss(%rip), %rdi
+ leaq _ebss(%rip), %rcx
subq %rdi, %rcx
- cld
- rep
- stosb
-
- /* Setup the stack */
- leaq boot_stack_end(%rip), %rsp
-
- /* zero EFLAGS after setting rsp */
- pushq $0
- popfq
+ shrq $3, %rcx
+ rep stosq
/*
* Do the decompression, and jump to the new kernel..
*/
- pushq %rsi # Save the real mode argument
- movq %rsi, %rdi # real mode address
- leaq boot_heap(%rip), %rsi # malloc area for uncompression
- leaq input_data(%rip), %rdx # input_data
- movl input_len(%rip), %eax
- movq %rax, %rcx # input_len
- movq %rbp, %r8 # output
+ pushq %rsi /* Save the real mode argument */
+ movq %rsi, %rdi /* real mode address */
+ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
+ leaq input_data(%rip), %rdx /* input_data */
+ movl $z_input_len, %ecx /* input_len */
+ movq %rbp, %r8 /* output target address */
call decompress_kernel
popq %rsi
@@ -311,11 +308,21 @@ gdt:
.quad 0x0000000000000000 /* TS continued */
gdt_end:
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+ .bss
+ .balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+ .section ".pgtable","a",@nobits
+ .balign 4096
+pgtable:
+ .fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684f..842b2a36174 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+ error("Destination address inappropriately aligned");
#ifdef CONFIG_X86_64
- if ((unsigned long)output & (__KERNEL_ALIGN - 1))
- error("Destination address not 2M aligned");
- if ((unsigned long)output >= 0xffffffffffUL)
+ if (heap > 0x3fffffffffffUL)
error("Destination address too large");
#else
- if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
- error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
error("Destination address too large");
+#endif
#ifndef CONFIG_RELOCATABLE
- if ((u32)output != LOAD_PHYSICAL_ADDR)
+ if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
error("Wrong destination address");
#endif
-#endif
if (!quiet)
putstr("\nDecompressing Linux... ");
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 00000000000..bcbd36c4143
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Compute the desired load offset from a compressed program; outputs
+ * a small assembly wrapper with the appropriate symbols defined.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static uint32_t getle32(const void *p)
+{
+ const uint8_t *cp = p;
+
+ return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
+ ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
+}
+
+int main(int argc, char *argv[])
+{
+ uint32_t olen;
+ long ilen;
+ unsigned long offs;
+ FILE *f;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+ return 1;
+ }
+
+ /* Get the information for the compressed kernel image first */
+
+ f = fopen(argv[1], "r");
+ if (!f) {
+ perror(argv[1]);
+ return 1;
+ }
+
+
+ if (fseek(f, -4L, SEEK_END)) {
+ perror(argv[1]);
+ }
+ fread(&olen, sizeof olen, 1, f);
+ ilen = ftell(f);
+ olen = getle32(&olen);
+ fclose(f);
+
+ /*
+ * Now we have the input (compressed) and output (uncompressed)
+ * sizes, compute the necessary decompression offset...
+ */
+
+ offs = (olen > ilen) ? olen - ilen : 0;
+ offs += olen >> 12; /* Add 8 bytes for each 32K block */
+ offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
+ offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+
+ printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+ printf(".globl z_input_len\n");
+ printf("z_input_len = %lu\n", ilen);
+ printf(".globl z_output_len\n");
+ printf("z_output_len = %lu\n", (unsigned long)olen);
+ printf(".globl z_extract_offset\n");
+ printf("z_extract_offset = 0x%lx\n", offs);
+ /* z_extract_offset_negative allows simplification of head_32.S */
+ printf(".globl z_extract_offset_negative\n");
+ printf("z_extract_offset_negative = -0x%lx\n", offs);
+
+ printf(".globl input_data, input_data_end\n");
+ printf("input_data:\n");
+ printf(".incbin \"%s\"\n", argv[1]);
+ printf("input_data_end:\n");
+
+ return 0;
+}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bc..cc353e1b3ff 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#undef i386
+
+#include <asm/page_types.h>
+
+#ifdef CONFIG_X86_64
OUTPUT_ARCH(i386:x86-64)
ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
SECTIONS
{
/* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
*(.data.*)
_edata = . ;
}
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
*(.bss.*)
*(COMMON)
- . = ALIGN(8);
- _end_before_pgt = . ;
- . = ALIGN(4096);
- pgtable = . ;
- . = . + 4096 * 6;
+ . = ALIGN(8); /* For convenience during zeroing */
_ebss = .;
}
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ .pgtable : {
+ _pgtable = . ;
+ *(.pgtable)
+ _epgtable = . ;
+ }
+#endif
+ _end = .;
}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
- .rodata.compressed : {
- input_len = .;
- LONG(input_data_end - input_data) input_data = .;
- *(.data)
- output_len = . - 4;
- input_data_end = .;
- }
-}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
- /* Be careful parts of head_32.S assume startup_32 is at
- * address 0.
- */
- . = 0;
- .text.head : {
- _head = . ;
- *(.text.head)
- _ehead = . ;
- }
- .rodata.compressed : {
- *(.rodata.compressed)
- }
- .text : {
- _text = .; /* Text */
- *(.text)
- *(.text.*)
- _etext = . ;
- }
- .rodata : {
- _rodata = . ;
- *(.rodata) /* read-only data */
- *(.rodata.*)
- _erodata = . ;
- }
- .data : {
- _data = . ;
- *(.data)
- *(.data.*)
- _edata = . ;
- }
- .bss : {
- _bss = . ;
- *(.bss)
- *(.bss.*)
- *(COMMON)
- _end = . ;
- }
-}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca..c501a5b466f 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
*/
static int read_mbr(u8 devno, void *buf)
{
- u16 ax, bx, cx, dx;
+ struct biosregs ireg, oreg;
- ax = 0x0201; /* Legacy Read, one sector */
- cx = 0x0001; /* Sector 0-0-1 */
- dx = devno;
- bx = (size_t)buf;
- asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
- : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
- : : "esi", "edi", "memory");
+ initregs(&ireg);
+ ireg.ax = 0x0201; /* Legacy Read, one sector */
+ ireg.cx = 0x0001; /* Sector 0-0-1 */
+ ireg.dl = devno;
+ ireg.bx = (size_t)buf;
- return -(u8)ax; /* 0 or -1 */
+ intcall(0x13, &ireg, &oreg);
+
+ return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
}
static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
static int get_edd_info(u8 devno, struct edd_info *ei)
{
- u16 ax, bx, cx, dx, di;
+ struct biosregs ireg, oreg;
memset(ei, 0, sizeof *ei);
/* Check Extensions Present */
- ax = 0x4100;
- bx = EDDMAGIC1;
- dx = devno;
- asm("pushfl; stc; int $0x13; setc %%al; popfl"
- : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
- : : "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x41;
+ ireg.bx = EDDMAGIC1;
+ ireg.dl = devno;
+ intcall(0x13, &ireg, &oreg);
- if ((u8)ax)
+ if (oreg.eflags & X86_EFLAGS_CF)
return -1; /* No extended information */
- if (bx != EDDMAGIC2)
+ if (oreg.bx != EDDMAGIC2)
return -1;
ei->device = devno;
- ei->version = ax >> 8; /* EDD version number */
- ei->interface_support = cx; /* EDD functionality subsets */
+ ei->version = oreg.ah; /* EDD version number */
+ ei->interface_support = oreg.cx; /* EDD functionality subsets */
/* Extended Get Device Parameters */
ei->params.length = sizeof(ei->params);
- ax = 0x4800;
- dx = devno;
- asm("pushfl; int $0x13; popfl"
- : "+a" (ax), "+d" (dx), "=m" (ei->params)
- : "S" (&ei->params)
- : "ebx", "ecx", "edi");
+ ireg.ah = 0x48;
+ ireg.si = (size_t)&ei->params;
+ intcall(0x13, &ireg, &oreg);
/* Get legacy CHS parameters */
/* Ralf Brown recommends setting ES:DI to 0:0 */
- ax = 0x0800;
- dx = devno;
- di = 0;
- asm("pushw %%es; "
- "movw %%di,%%es; "
- "pushfl; stc; int $0x13; setc %%al; popfl; "
- "popw %%es"
- : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
- : : "esi");
-
- if ((u8)ax == 0) {
- ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
- ei->legacy_max_head = dx >> 8;
- ei->legacy_sectors_per_track = cx & 0x3f;
+ ireg.ah = 0x08;
+ ireg.es = 0;
+ intcall(0x13, &ireg, &oreg);
+
+ if (!(oreg.eflags & X86_EFLAGS_CF)) {
+ ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+ ei->legacy_max_head = oreg.dh;
+ ei->legacy_sectors_per_track = oreg.cl & 0x3f;
}
return 0;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4..b31cc54b464 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
#include <asm/page_types.h>
#include <asm/setup.h>
#include "boot.h"
-#include "offsets.h"
+#include "voffset.h"
+#include "zoffset.h"
BOOTSEG = 0x07C0 /* original address of boot-sector */
SYSSEG = 0x1000 /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x0209 # header version number (>= 0x0105)
+ .word 0x020a # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512
# end of setup code can be used by setup
# for local heap purposes.
-pad1: .word 0
+ext_loader_ver:
+ .byte 0 # Extended boot loader version
+ext_loader_type:
+ .byte 0 # Extended boot loader type
+
cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
# If nonzero, a 32-bit pointer
# to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel: .byte 1
#else
relocatable_kernel: .byte 0
#endif
-pad2: .byte 0
+min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
pad3: .word 0
cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
hardware_subarch_data: .quad 0
-payload_offset: .long input_data
-payload_length: .long input_data_end-input_data
+payload_offset: .long ZO_input_data
+payload_length: .long ZO_z_input_len
setup_data: .quad 0 # 64-bit physical pointer to
# single linked list of
# struct setup_data
+pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
+
+#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
+#define VO_INIT_SIZE (VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+#define INIT_SIZE ZO_INIT_SIZE
+#else
+#define INIT_SIZE VO_INIT_SIZE
+#endif
+init_size: .long INIT_SIZE # kernel initialization size
+
# End of setup header #####################################################
- .section ".inittext", "ax"
+ .section ".entrytext", "ax"
start_of_setup:
#ifdef SAFE_RESET_DISK_CONTROLLER
# Reset the disk controller.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae..140172b895b 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
*/
static void keyboard_set_repeat(void)
{
- u16 ax = 0x0305;
- u16 bx = 0;
- asm volatile("int $0x16"
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
+ struct biosregs ireg;
+ initregs(&ireg);
+ ireg.ax = 0x0305;
+ intcall(0x16, &ireg, NULL);
}
/*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
*/
static void query_ist(void)
{
+ struct biosregs ireg, oreg;
+
/* Some older BIOSes apparently crash on this call, so filter
it from machines too old to have SpeedStep at all. */
if (cpu.level < 6)
return;
- asm("int $0x15"
- : "=a" (boot_params.ist_info.signature),
- "=b" (boot_params.ist_info.command),
- "=c" (boot_params.ist_info.event),
- "=d" (boot_params.ist_info.perf_level)
- : "a" (0x0000e980), /* IST Support */
- "d" (0x47534943)); /* Request value */
+ initregs(&ireg);
+ ireg.ax = 0xe980; /* IST Support */
+ ireg.edx = 0x47534943; /* Request value */
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.ist_info.signature = oreg.eax;
+ boot_params.ist_info.command = oreg.ebx;
+ boot_params.ist_info.event = oreg.ecx;
+ boot_params.ist_info.perf_level = oreg.edx;
}
/*
@@ -93,13 +97,12 @@ static void query_ist(void)
static void set_bios_mode(void)
{
#ifdef CONFIG_X86_64
- u32 eax, ebx;
+ struct biosregs ireg;
- eax = 0xec00;
- ebx = 2;
- asm volatile("int $0x15"
- : "+a" (eax), "+b" (ebx)
- : : "ecx", "edx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ax = 0xec00;
+ ireg.bx = 2;
+ intcall(0x15, &ireg, NULL);
#endif
}
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d69..a95a531148e 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
int query_mca(void)
{
- u8 err;
- u16 es, bx, len;
-
- asm("pushw %%es ; "
- "int $0x15 ; "
- "setc %0 ; "
- "movw %%es, %1 ; "
- "popw %%es"
- : "=acd" (err), "=acdSD" (es), "=b" (bx)
- : "a" (0xc000));
-
- if (err)
+ struct biosregs ireg, oreg;
+ u16 len;
+
+ initregs(&ireg);
+ ireg.ah = 0xc0;
+ intcall(0x15, &ireg, &oreg);
+
+ if (oreg.eflags & X86_EFLAGS_CF)
return -1; /* No MCA present */
- set_fs(es);
- len = rdfs16(bx);
+ set_fs(oreg.es);
+ len = rdfs16(oreg.bx);
if (len > sizeof(boot_params.sys_desc_table))
len = sizeof(boot_params.sys_desc_table);
- copy_from_fs(&boot_params.sys_desc_table, bx, len);
+ copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
return 0;
}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 74b3d2ba84e..cae3feb1035 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,12 +20,16 @@
static int detect_memory_e820(void)
{
int count = 0;
- u32 next = 0;
- u32 size, id, edi;
- u8 err;
+ struct biosregs ireg, oreg;
struct e820entry *desc = boot_params.e820_map;
static struct e820entry buf; /* static so it is zeroed */
+ initregs(&ireg);
+ ireg.ax = 0xe820;
+ ireg.cx = sizeof buf;
+ ireg.edx = SMAP;
+ ireg.di = (size_t)&buf;
+
/*
* Note: at least one BIOS is known which assumes that the
* buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
*/
do {
- size = sizeof buf;
-
- /* Important: %edx and %esi are clobbered by some BIOSes,
- so they must be either used for the error output
- or explicitly marked clobbered. Given that, assume there
- is something out there clobbering %ebp and %edi, too. */
- asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
- : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
- "=D" (edi), "+m" (buf)
- : "D" (&buf), "d" (SMAP), "a" (0xe820)
- : "esi");
+ intcall(0x15, &ireg, &oreg);
+ ireg.ebx = oreg.ebx; /* for next iteration... */
/* BIOSes which terminate the chain with CF = 1 as opposed
to %ebx = 0 don't always report the SMAP signature on
the final, failing, probe. */
- if (err)
+ if (oreg.eflags & X86_EFLAGS_CF)
break;
/* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
screwed up the map at that point, we might have a
partial map, the full map, or complete garbage, so
just return failure. */
- if (id != SMAP) {
+ if (oreg.eax != SMAP) {
count = 0;
break;
}
*desc++ = buf;
count++;
- } while (next && count < ARRAY_SIZE(boot_params.e820_map));
+ } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
return boot_params.e820_entries = count;
}
static int detect_memory_e801(void)
{
- u16 ax, bx, cx, dx;
- u8 err;
+ struct biosregs ireg, oreg;
- bx = cx = dx = 0;
- ax = 0xe801;
- asm("stc; int $0x15; setc %0"
- : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+ initregs(&ireg);
+ ireg.ax = 0xe801;
+ intcall(0x15, &ireg, &oreg);
- if (err)
+ if (oreg.eflags & X86_EFLAGS_CF)
return -1;
/* Do we really need to do this? */
- if (cx || dx) {
- ax = cx;
- bx = dx;
+ if (oreg.cx || oreg.dx) {
+ oreg.ax = oreg.cx;
+ oreg.bx = oreg.dx;
}
- if (ax > 15*1024)
+ if (oreg.ax > 15*1024) {
return -1; /* Bogus! */
-
- /* This ignores memory above 16MB if we have a memory hole
- there. If someone actually finds a machine with a memory
- hole at 16MB and no support for 0E820h they should probably
- generate a fake e820 map. */
- boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+ } else if (oreg.ax == 15*1024) {
+ boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+ } else {
+ /*
+ * This ignores memory above 16MB if we have a memory
+ * hole there. If someone actually finds a machine
+ * with a memory hole at 16MB and no support for
+ * 0E820h they should probably generate a fake e820
+ * map.
+ */
+ boot_params.alt_mem_k = oreg.ax;
+ }
return 0;
}
static int detect_memory_88(void)
{
- u16 ax;
- u8 err;
+ struct biosregs ireg, oreg;
- ax = 0x8800;
- asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+ initregs(&ireg);
+ ireg.ah = 0x88;
+ intcall(0x15, &ireg, &oreg);
- boot_params.screen_info.ext_mem_k = ax;
+ boot_params.screen_info.ext_mem_k = oreg.ax;
- return -err;
+ return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
}
int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 00000000000..958019b1cfa
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+
+void initregs(struct biosregs *reg)
+{
+ memset(reg, 0, sizeof *reg);
+ reg->eflags |= X86_EFLAGS_CF;
+ reg->ds = ds();
+ reg->es = ds();
+ reg->fs = fs();
+ reg->gs = gs();
+}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de796..0f6ec455a2b 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
. = 497;
.header : { *(.header) }
+ .entrytext : { *(.entrytext) }
.inittext : { *(.inittext) }
.initdata : { *(.initdata) }
+ __end_init = .;
+
.text : { *(.text) }
.text32 : { *(.text32) }
@@ -52,4 +55,7 @@ SECTIONS
. = ASSERT(_end <= 0x8000, "Setup too big!");
. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+ /* Necessary for the very-old-loader check to work... */
+ . = ASSERT(__end_init <= 5*512, "init sections too big!");
+
}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f..01ec69c901c 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
void __attribute__((section(".inittext"))) putchar(int ch)
{
- unsigned char c = ch;
+ struct biosregs ireg;
- if (c == '\n')
+ if (ch == '\n')
putchar('\r'); /* \n -> \r\n */
- /* int $0x10 is known to have bugs involving touching registers
- it shouldn't. Be extra conservative... */
- asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
- : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+ initregs(&ireg);
+ ireg.bx = 0x0007;
+ ireg.cx = 0x0001;
+ ireg.ah = 0x0e;
+ ireg.al = ch;
+ intcall(0x10, &ireg, NULL);
}
void __attribute__((section(".inittext"))) puts(const char *str)
{
- int n = 0;
- while (*str) {
+ while (*str)
putchar(*str++);
- n++;
- }
}
/*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
static u8 gettime(void)
{
- u16 ax = 0x0200;
- u16 cx, dx;
+ struct biosregs ireg, oreg;
- asm volatile("int $0x1a"
- : "+a" (ax), "=c" (cx), "=d" (dx)
- : : "ebx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x02;
+ intcall(0x1a, &ireg, &oreg);
- return dx >> 8;
+ return oreg.dh;
}
/*
@@ -64,19 +63,24 @@ static u8 gettime(void)
*/
int getchar(void)
{
- u16 ax = 0;
- asm volatile("int $0x16" : "+a" (ax));
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ /* ireg.ah = 0x00; */
+ intcall(0x16, &ireg, &oreg);
- return ax & 0xff;
+ return oreg.al;
}
static int kbd_pending(void)
{
- u8 pending;
- asm volatile("int $0x16; setnz %0"
- : "=qm" (pending)
- : "a" (0x0100));
- return pending;
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ah = 0x01;
+ intcall(0x16, &ireg, &oreg);
+
+ return !(oreg.eflags & X86_EFLAGS_ZF);
}
void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c36..49e0c18833e 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -29,21 +30,20 @@ static int bios_set_mode(struct mode_info *mi)
static int set_bios_mode(u8 mode)
{
- u16 ax;
+ struct biosregs ireg, oreg;
u8 new_mode;
- ax = mode; /* AH=0x00 Set Video Mode */
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ initregs(&ireg);
+ ireg.al = mode; /* AH=0x00 Set Video Mode */
+ intcall(0x10, &ireg, NULL);
- ax = 0x0f00; /* Get Current Video Mode */
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ah = 0x0f; /* Get Current Video Mode */
+ intcall(0x10, &ireg, &oreg);
do_restore = 1; /* Assume video contents were lost */
- new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */
+
+ /* Not all BIOSes are clean with the top bit */
+ new_mode = oreg.al & 0x7f;
if (new_mode == mode)
return 0; /* Mode change OK */
@@ -53,10 +53,8 @@ static int set_bios_mode(u8 mode)
/* Mode setting failed, but we didn't end up where we
started. That's bad. Try to revert to the original
video mode. */
- ax = boot_params.screen_info.orig_video_mode;
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ax = boot_params.screen_info.orig_video_mode;
+ intcall(0x10, &ireg, NULL);
}
#endif
return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f6..275dd177f19 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
static int vesa_probe(void)
{
#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
- u16 ax, cx, di;
+ struct biosregs ireg, oreg;
u16 mode;
addr_t mode_ptr;
struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
video_vesa.modes = GET_HEAP(struct mode_info, 0);
- ax = 0x4f00;
- di = (size_t)&vginfo;
- asm(INT10
- : "+a" (ax), "+D" (di), "=m" (vginfo)
- : : "ebx", "ecx", "edx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f00;
+ ireg.di = (size_t)&vginfo;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f ||
+ if (oreg.ax != 0x004f ||
vginfo.signature != VESA_MAGIC ||
vginfo.version < 0x0102)
return 0; /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
- ax = 0x4f01;
- cx = mode;
- di = (size_t)&vminfo;
- asm(INT10
- : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
- : : "ebx", "edx", "esi");
+ ireg.ax = 0x4f01;
+ ireg.cx = mode;
+ ireg.di = (size_t)&vminfo;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
continue;
if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
static int vesa_set_mode(struct mode_info *mode)
{
- u16 ax, bx, cx, di;
+ struct biosregs ireg, oreg;
int is_graphic;
u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
- ax = 0x4f01;
- cx = vesa_mode;
- di = (size_t)&vminfo;
- asm(INT10
- : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
- : : "ebx", "edx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f01;
+ ireg.cx = vesa_mode;
+ ireg.di = (size_t)&vminfo;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return -1;
if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
}
- ax = 0x4f02;
- bx = vesa_mode;
- di = 0;
- asm volatile(INT10
- : "+a" (ax), "+b" (bx), "+D" (di)
- : : "ecx", "edx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f02;
+ ireg.bx = vesa_mode;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return -1;
graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
/* Switch DAC to 8-bit mode */
static void vesa_dac_set_8bits(void)
{
+ struct biosregs ireg, oreg;
u8 dac_size = 6;
/* If possible, switch the DAC to 8-bit mode */
if (vginfo.capabilities & 1) {
- u16 ax, bx;
-
- ax = 0x4f08;
- bx = 0x0800;
- asm volatile(INT10
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
-
- if (ax == 0x004f)
- dac_size = bx >> 8;
+ initregs(&ireg);
+ ireg.ax = 0x4f08;
+ ireg.bh = 0x08;
+ intcall(0x10, &ireg, &oreg);
+ if (oreg.ax == 0x004f)
+ dac_size = oreg.bh;
}
/* Set the color sizes to the DAC size, and offsets to 0 */
- boot_params.screen_info.red_size = dac_size;
+ boot_params.screen_info.red_size = dac_size;
boot_params.screen_info.green_size = dac_size;
- boot_params.screen_info.blue_size = dac_size;
- boot_params.screen_info.rsvd_size = dac_size;
+ boot_params.screen_info.blue_size = dac_size;
+ boot_params.screen_info.rsvd_size = dac_size;
- boot_params.screen_info.red_pos = 0;
- boot_params.screen_info.green_pos = 0;
- boot_params.screen_info.blue_pos = 0;
- boot_params.screen_info.rsvd_pos = 0;
+ boot_params.screen_info.red_pos = 0;
+ boot_params.screen_info.green_pos = 0;
+ boot_params.screen_info.blue_pos = 0;
+ boot_params.screen_info.rsvd_pos = 0;
}
/* Save the VESA protected mode info */
static void vesa_store_pm_info(void)
{
- u16 ax, bx, di, es;
+ struct biosregs ireg, oreg;
- ax = 0x4f0a;
- bx = di = 0;
- asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
- : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
- : : "ecx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f0a;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return;
- boot_params.screen_info.vesapm_seg = es;
- boot_params.screen_info.vesapm_off = di;
+ boot_params.screen_info.vesapm_seg = oreg.es;
+ boot_params.screen_info.vesapm_off = oreg.di;
}
/*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
void vesa_store_edid(void)
{
#ifdef CONFIG_FIRMWARE_EDID
- u16 ax, bx, cx, dx, di;
+ struct biosregs ireg, oreg;
/* Apparently used as a nonsense token... */
memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
if (vginfo.version < 0x0200)
return; /* EDID requires VBE 2.0+ */
- ax = 0x4f15; /* VBE DDC */
- bx = 0x0000; /* Report DDC capabilities */
- cx = 0; /* Controller 0 */
- di = 0; /* ES:DI must be 0 by spec */
-
- /* Note: The VBE DDC spec is different from the main VESA spec;
- we genuinely have to assume all registers are destroyed here. */
-
- asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
- : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
- : : "esi", "edx");
+ initregs(&ireg);
+ ireg.ax = 0x4f15; /* VBE DDC */
+ /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
+ /* ireg.cx = 0; */ /* Controller 0 */
+ ireg.es = 0; /* ES:DI must be 0 by spec */
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return; /* No EDID */
/* BH = time in seconds to transfer EDD information */
/* BL = DDC level supported */
- ax = 0x4f15; /* VBE DDC */
- bx = 0x0001; /* Read EDID */
- cx = 0; /* Controller 0 */
- dx = 0; /* EDID block number */
- di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
- asm(INT10
- : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
- "+c" (cx), "+D" (di)
- : : "esi");
+ ireg.ax = 0x4f15; /* VBE DDC */
+ ireg.bx = 0x0001; /* Read EDID */
+ /* ireg.cx = 0; */ /* Controller 0 */
+ /* ireg.dx = 0; */ /* EDID block number */
+ ireg.es = ds();
+ ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+ intcall(0x10, &ireg, &oreg);
#endif /* CONFIG_FIRMWARE_EDID */
}
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a3776..8f8d827e254 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
/* Set basic 80x25 mode */
static u8 vga_set_basic_mode(void)
{
+ struct biosregs ireg, oreg;
u16 ax;
u8 rows;
u8 mode;
+ initregs(&ireg);
+
#ifdef CONFIG_VIDEO_400_HACK
if (adapter >= ADAPTER_VGA) {
- asm volatile(INT10
- : : "a" (0x1202), "b" (0x0030)
- : "ecx", "edx", "esi", "edi");
+ ireg.ax = 0x1202;
+ ireg.bx = 0x0030;
+ intcall(0x10, &ireg, NULL);
}
#endif
ax = 0x0f00;
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
-
- mode = (u8)ax;
+ intcall(0x10, &ireg, &oreg);
+ mode = oreg.al;
set_fs(0);
rows = rdfs8(0x484); /* rows minus one */
#ifndef CONFIG_VIDEO_400_HACK
- if ((ax == 0x5003 || ax == 0x5007) &&
+ if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
(rows == 0 || rows == 24))
return mode;
#endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
mode = 3;
/* Set the mode */
- ax = mode;
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ax = mode; /* AH=0: set mode */
+ intcall(0x10, &ireg, NULL);
do_restore = 1;
return mode;
}
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
static void vga_set_8font(void)
{
/* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+ struct biosregs ireg;
+
+ initregs(&ireg);
/* Set 8x8 font */
- asm volatile(INT10 : : "a" (0x1112), "b" (0));
+ ireg.ax = 0x1112;
+ /* ireg.bl = 0; */
+ intcall(0x10, &ireg, NULL);
/* Use alternate print screen */
- asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+ ireg.ax = 0x1200;
+ ireg.bl = 0x20;
+ intcall(0x10, &ireg, NULL);
/* Turn off cursor emulation */
- asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+ ireg.ax = 0x1201;
+ ireg.bl = 0x34;
+ intcall(0x10, &ireg, NULL);
/* Cursor is scan lines 6-7 */
- asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+ ireg.ax = 0x0100;
+ ireg.cx = 0x0607;
+ intcall(0x10, &ireg, NULL);
}
static void vga_set_14font(void)
{
/* Set 9x14 font - 80x28 on VGA */
+ struct biosregs ireg;
+
+ initregs(&ireg);
/* Set 9x14 font */
- asm volatile(INT10 : : "a" (0x1111), "b" (0));
+ ireg.ax = 0x1111;
+ /* ireg.bl = 0; */
+ intcall(0x10, &ireg, NULL);
/* Turn off cursor emulation */
- asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+ ireg.ax = 0x1201;
+ ireg.bl = 0x34;
+ intcall(0x10, &ireg, NULL);
/* Cursor is scan lines 11-12 */
- asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+ ireg.ax = 0x0100;
+ ireg.cx = 0x0b0c;
+ intcall(0x10, &ireg, NULL);
}
static void vga_set_80x43(void)
{
/* Set 80x43 mode on VGA (not EGA) */
+ struct biosregs ireg;
+
+ initregs(&ireg);
/* Set 350 scans */
- asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+ ireg.ax = 0x1201;
+ ireg.bl = 0x30;
+ intcall(0x10, &ireg, NULL);
/* Reset video mode */
- asm volatile(INT10 : : "a" (0x0003));
+ ireg.ax = 0x0003;
+ intcall(0x10, &ireg, NULL);
vga_set_8font();
}
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
*/
static int vga_probe(void)
{
- u16 ega_bx;
-
static const char *card_name[] = {
"CGA/MDA/HGC", "EGA", "VGA"
};
@@ -240,26 +263,26 @@ static int vga_probe(void)
sizeof(ega_modes)/sizeof(struct mode_info),
sizeof(vga_modes)/sizeof(struct mode_info),
};
- u8 vga_flag;
- asm(INT10
- : "=b" (ega_bx)
- : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
- : "ecx", "edx", "esi", "edi");
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+
+ ireg.ax = 0x1200;
+ ireg.bl = 0x10; /* Check EGA/VGA */
+ intcall(0x10, &ireg, &oreg);
#ifndef _WAKEUP
- boot_params.screen_info.orig_video_ega_bx = ega_bx;
+ boot_params.screen_info.orig_video_ega_bx = oreg.bx;
#endif
/* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
- if ((u8)ega_bx != 0x10) {
+ if (oreg.bl != 0x10) {
/* EGA/VGA */
- asm(INT10
- : "=a" (vga_flag)
- : "a" (0x1a00)
- : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ax = 0x1a00;
+ intcall(0x10, &ireg, &oreg);
- if (vga_flag == 0x1a) {
+ if (oreg.al == 0x1a) {
adapter = ADAPTER_VGA;
#ifndef _WAKEUP
boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe..bad728b76fc 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
static void store_cursor_position(void)
{
- u16 curpos;
- u16 ax, bx;
+ struct biosregs ireg, oreg;
- ax = 0x0300;
- bx = 0;
- asm(INT10
- : "=d" (curpos), "+a" (ax), "+b" (bx)
- : : "ecx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x03;
+ intcall(0x10, &ireg, &oreg);
- boot_params.screen_info.orig_x = curpos;
- boot_params.screen_info.orig_y = curpos >> 8;
+ boot_params.screen_info.orig_x = oreg.dl;
+ boot_params.screen_info.orig_y = oreg.dh;
}
static void store_video_mode(void)
{
- u16 ax, page;
+ struct biosregs ireg, oreg;
/* N.B.: the saving of the video page here is a bit silly,
since we pretty much assume page 0 everywhere. */
- ax = 0x0f00;
- asm(INT10
- : "+a" (ax), "=b" (page)
- : : "ecx", "edx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x0f;
+ intcall(0x10, &ireg, &oreg);
/* Not all BIOSes are clean with respect to the top bit */
- boot_params.screen_info.orig_video_mode = ax & 0x7f;
- boot_params.screen_info.orig_video_page = page >> 8;
+ boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+ boot_params.screen_info.orig_video_page = oreg.bh;
}
/*
@@ -257,7 +254,7 @@ static void restore_screen(void)
int y;
addr_t dst = 0;
u16 *src = saved.data;
- u16 ax, bx, dx;
+ struct biosregs ireg;
if (graphic_mode)
return; /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
}
/* Restore cursor position */
- ax = 0x0200; /* Set cursor position */
- bx = 0; /* Page number (<< 8) */
- dx = (saved.cury << 8)+saved.curx;
- asm volatile(INT10
- : "+a" (ax), "+b" (bx), "+d" (dx)
- : : "ecx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x02; /* Set cursor position */
+ ireg.dh = saved.cury;
+ ireg.dl = saved.curx;
+ intcall(0x10, &ireg, NULL);
}
#else
#define save_screen() ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d1446..5bb174a997f 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
extern int do_restore; /* Restore screen contents */
extern int graphic_mode; /* Graphics mode with linear frame buffer */
-/*
- * int $0x10 is notorious for touching registers it shouldn't.
- * gcc doesn't like %ebp being clobbered, so define it as a push/pop
- * sequence here.
- *
- * A number of systems, including the original PC can clobber %bp in
- * certain circumstances, like when scrolling. There exists at least
- * one Trident video card which could clobber DS under a set of
- * circumstances that we are unlikely to encounter (scrolling when
- * using an extended graphics mode of more than 800x600 pixels), but
- * it's cheap insurance to deal with that here.
- */
-#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
-
/* Accessing VGA indexed registers */
static inline u8 in_idx(u16 port, u8 index)
{
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f..edb992ebef9 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:50:58 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:21:55 2009
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
# CONFIG_X86_64 is not set
CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf32-i386"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
# CONFIG_AUDIT_ARCH is not set
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_X86_32_SMP=y
CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
CONFIG_X86_TRAMPOLINE=y
+CONFIG_X86_32_LAZY_GS=y
CONFIG_KTIME_SCALAR=y
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
CONFIG_HAVE_GENERIC_DMA_COHERENT=y
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
# CONFIG_LBD is not set
-CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_SPARSE_IRQ=y
-CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
+# CONFIG_X86_BIGSMP is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_VSMP is not set
# CONFIG_X86_RDC321X is not set
+# CONFIG_X86_32_NON_STANDARD is not set
CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=5
CONFIG_X86_XADD=y
# CONFIG_X86_PPRO_FENCE is not set
CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
CONFIG_CPU_SUP_INTEL=y
CONFIG_CPU_SUP_CYRIX_32=y
CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_CENTAUR=y
CONFIG_CPU_SUP_TRANSMETA_32=y
CONFIG_CPU_SUP_UMC_32=y
CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
# CONFIG_NOHIGHMEM is not set
CONFIG_HIGHMEM4G=y
# CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_HIGHPTE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
# CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
# CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
CONFIG_PCI_DIRECT=y
CONFIG_PCI_MMCONFIG=y
CONFIG_PCI_DOMAINS=y
+# CONFIG_DMAR is not set
CONFIG_PCIEPORTBUS=y
# CONFIG_HOTPLUG_PCI_PCIE is not set
CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
CONFIG_ISA_DMA_API=y
# CONFIG_ISA is not set
# CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
#
# Networking options
#
-CONFIG_COMPAT_NET_DEV_OPS=y
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
#
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
CONFIG_HAMRADIO=y
#
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
CONFIG_WIRELESS=y
CONFIG_CFG80211=y
# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
CONFIG_WIRELESS_OLD_REGULATORY=y
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
# CONFIG_C2PORT is not set
#
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_LOWLEVEL is not set
# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
# CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=y
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
# CONFIG_IFB is not set
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
CONFIG_NET_VENDOR_3COM=y
# CONFIG_VORTEX is not set
# CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
# CONFIG_DE2104X is not set
# CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
CONFIG_E1000E=y
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
# CONFIG_NS83820 is not set
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
# CONFIG_JME is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_IXGBE is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_BNX2X is not set
# CONFIG_QLGE is not set
# CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
CONFIG_TR=y
# CONFIG_IBMOL is not set
# CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
# CONFIG_LIBERTAS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
# CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
# CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
# CONFIG_AIRO_CS is not set
# CONFIG_PCMCIA_WL3501 is not set
# CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
# CONFIG_RTL8187 is not set
# CONFIG_ADM8211 is not set
# CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
# CONFIG_P54_COMMON is not set
CONFIG_ATH5K=y
# CONFIG_ATH5K_DEBUG is not set
# CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
# CONFIG_IPW2100 is not set
# CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
# CONFIG_HOSTAP is not set
# CONFIG_B43 is not set
# CONFIG_B43LEGACY is not set
# CONFIG_ZD1211RW is not set
# CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
#
# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
# CONFIG_TABLET_USB_KBTAB is not set
# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
# CONFIG_TOUCHSCREEN_FUJITSU is not set
# CONFIG_TOUCHSCREEN_GUNZE is not set
# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
CONFIG_HW_RANDOM_INTEL=y
CONFIG_HW_RANDOM_AMD=y
CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_MAX6875 is not set
# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_ADT7475 is not set
# CONFIG_SENSORS_K8TEMP is not set
# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
# CONFIG_SENSORS_ATXP1 is not set
# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_FSCHER is not set
# CONFIG_SENSORS_FSCPOS is not set
# CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_LM90 is not set
# CONFIG_SENSORS_LM92 is not set
# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
# CONFIG_SENSORS_MAX1619 is not set
# CONFIG_SENSORS_MAX6650 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_SIS5595 is not set
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
# CONFIG_FB_3DFX is not set
# CONFIG_FB_VOODOO1 is not set
# CONFIG_FB_VT8623 is not set
-# CONFIG_FB_CYBLA is not set
# CONFIG_FB_TRIDENT is not set
# CONFIG_FB_ARK is not set
# CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
# CONFIG_SND_INDIGO is not set
# CONFIG_SND_INDIGOIO is not set
# CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
# CONFIG_SND_EMU10K1 is not set
# CONFIG_SND_EMU10K1X is not set
# CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
#
# Special HID drivers
#
-CONFIG_HID_COMPAT=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
CONFIG_HID_CHERRY=y
CONFIG_HID_CHICONY=y
CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
# CONFIG_USB_TMC is not set
#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
#
#
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
# CONFIG_USB_FTDI_ELAN is not set
# CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
#
# OTG and related infrastructure
#
+# CONFIG_NOP_USB_XCEIV is not set
# CONFIG_UWB is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
#
# CONFIG_LEDS_ALIX2 is not set
# CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
#
# LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
# CONFIG_ACCESSIBILITY is not set
# CONFIG_INFINIBAND is not set
CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
# DMA Devices
#
# CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
# CONFIG_STAGING is not set
CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
#
# CONFIG_EXT2_FS is not set
CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y
CONFIG_GENERIC_ACL=y
#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ROMFS_FS is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=y
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
# CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
# CONFIG_DEBUG_KOBJECT is not set
# CONFIG_DEBUG_HIGHMEM is not set
CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
#
# Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
# CONFIG_BOOT_TRACER is not set
# CONFIG_TRACE_BRANCH_PROFILING is not set
# CONFIG_POWER_TRACER is not set
# CONFIG_STACK_TRACER is not set
# CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_X86_PTDUMP is not set
CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
CONFIG_DEBUG_NX_TEST=m
# CONFIG_4KSTACKS is not set
CONFIG_DOUBLEFAULT=y
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
# CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
CONFIG_CRYPTO=y
#
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_HASH2=y
CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=y
# CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
# Compression
#
# CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
# CONFIG_CRYPTO_LZO is not set
#
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_GEODE is not set
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
CONFIG_VIRTUALIZATION=y
# CONFIG_KVM is not set
# CONFIG_LGUEST is not set
# CONFIG_VIRTIO_PCI is not set
# CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
#
# Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
# CONFIG_LIBCRC32C is not set
CONFIG_AUDIT_GENERIC=y
CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4..cee1dd2e69b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:44:16 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:22:00 2009
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
CONFIG_X86_64=y
CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf64-x86-64"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_AUDIT_ARCH=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_X86_64_SMP=y
CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
CONFIG_X86_TRAMPOLINE=y
# CONFIG_KTIME_SCALAR is not set
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODULE_SRCVERSION_ALL is not set
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
-CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set
CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_SPARSE_IRQ=y
-# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
-CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
# CONFIG_X86_VSMP is not set
+# CONFIG_X86_UV is not set
CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_MCORE2 is not set
CONFIG_GENERIC_CPU=y
CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_TSC=y
CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_CPU_SUP_INTEL=y
CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_CPU_SUP_CENTAUR=y
CONFIG_X86_DS=y
CONFIG_X86_PTRACE_BTS=y
CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCE=y
CONFIG_X86_MCE_INTEL=y
CONFIG_X86_MCE_AMD=y
+CONFIG_X86_MCE_THRESHOLD=y
# CONFIG_I8K is not set
CONFIG_MICROCODE=y
CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
CONFIG_DIRECT_GBPAGES=y
CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
# CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
# CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
CONFIG_ISA_DMA_API=y
CONFIG_K8_NB=y
CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
#
# Networking options
#
-CONFIG_COMPAT_NET_DEV_OPS=y
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
#
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
CONFIG_HAMRADIO=y
#
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
CONFIG_WIRELESS=y
CONFIG_CFG80211=y
# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
CONFIG_WIRELESS_OLD_REGULATORY=y
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
# CONFIG_TIFM_CORE is not set
# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_SGI_XP is not set
# CONFIG_HP_ILO is not set
-# CONFIG_SGI_GRU is not set
+# CONFIG_ISL29003 is not set
# CONFIG_C2PORT is not set
#
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_LOWLEVEL is not set
# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
# CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=y
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
# CONFIG_IFB is not set
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
CONFIG_NET_VENDOR_3COM=y
# CONFIG_VORTEX is not set
# CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
# CONFIG_DE2104X is not set
# CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
# CONFIG_E1000E is not set
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
# CONFIG_NS83820 is not set
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
# CONFIG_JME is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_IXGBE is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_BNX2X is not set
# CONFIG_QLGE is not set
# CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
CONFIG_TR=y
# CONFIG_IBMOL is not set
# CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
# CONFIG_LIBERTAS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
# CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
# CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
# CONFIG_AIRO_CS is not set
# CONFIG_PCMCIA_WL3501 is not set
# CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
# CONFIG_RTL8187 is not set
# CONFIG_ADM8211 is not set
# CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
# CONFIG_P54_COMMON is not set
CONFIG_ATH5K=y
# CONFIG_ATH5K_DEBUG is not set
# CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
# CONFIG_IPW2100 is not set
# CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
# CONFIG_HOSTAP is not set
# CONFIG_B43 is not set
# CONFIG_B43LEGACY is not set
# CONFIG_ZD1211RW is not set
# CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
#
# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
# CONFIG_TABLET_USB_KBTAB is not set
# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
# CONFIG_TOUCHSCREEN_FUJITSU is not set
# CONFIG_TOUCHSCREEN_GUNZE is not set
# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
# CONFIG_HW_RANDOM_INTEL is not set
# CONFIG_HW_RANDOM_AMD is not set
CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_MAX6875 is not set
# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_ADT7475 is not set
# CONFIG_SENSORS_K8TEMP is not set
# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
# CONFIG_SENSORS_ATXP1 is not set
# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_FSCHER is not set
# CONFIG_SENSORS_FSCPOS is not set
# CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_LM90 is not set
# CONFIG_SENSORS_LM92 is not set
# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
# CONFIG_SENSORS_MAX1619 is not set
# CONFIG_SENSORS_MAX6650 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_SIS5595 is not set
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
# CONFIG_SND_INDIGO is not set
# CONFIG_SND_INDIGOIO is not set
# CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
# CONFIG_SND_EMU10K1 is not set
# CONFIG_SND_EMU10K1X is not set
# CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
#
# Special HID drivers
#
-CONFIG_HID_COMPAT=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
CONFIG_HID_CHERRY=y
CONFIG_HID_CHICONY=y
CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
# CONFIG_USB_TMC is not set
#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
#
#
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
# CONFIG_USB_FTDI_ELAN is not set
# CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
#
# OTG and related infrastructure
#
+# CONFIG_NOP_USB_XCEIV is not set
# CONFIG_UWB is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
#
# CONFIG_LEDS_ALIX2 is not set
# CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
#
# LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
# CONFIG_ACCESSIBILITY is not set
# CONFIG_INFINIBAND is not set
CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
# DMA Devices
#
# CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
# CONFIG_STAGING is not set
CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
#
# CONFIG_EXT2_FS is not set
CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
@@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
CONFIG_GENERIC_ACL=y
#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ROMFS_FS is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=y
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
# CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
#
# Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
# CONFIG_BOOT_TRACER is not set
# CONFIG_TRACE_BRANCH_PROFILING is not set
# CONFIG_POWER_TRACER is not set
# CONFIG_STACK_TRACER is not set
# CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_X86_PTDUMP is not set
CONFIG_DEBUG_RODATA=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_NX_TEST=m
# CONFIG_IOMMU_DEBUG is not set
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
# CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
CONFIG_CRYPTO=y
#
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_HASH2=y
CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=y
# CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
#
CONFIG_CRYPTO_AES=y
# CONFIG_CRYPTO_AES_X86_64 is not set
+# CONFIG_CRYPTO_AES_NI_INTEL is not set
# CONFIG_CRYPTO_ANUBIS is not set
CONFIG_CRYPTO_ARC4=y
# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
# Compression
#
# CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
# CONFIG_CRYPTO_LZO is not set
#
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
CONFIG_VIRTUALIZATION=y
# CONFIG_KVM is not set
# CONFIG_VIRTIO_PCI is not set
# CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
#
# Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
# CONFIG_CRC7 is not set
# CONFIG_LIBCRC32C is not set
CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b..cfb0010fa94 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+obj-$(CONFIG_CRYPTO_FPU) += fpu.o
+
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index caba9960170..eb0566e8331 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -845,7 +845,7 @@ ENTRY(aesni_cbc_enc)
*/
ENTRY(aesni_cbc_dec)
cmp $16, LEN
- jb .Lcbc_dec_ret
+ jb .Lcbc_dec_just_ret
mov 480(KEYP), KLEN
add $240, KEYP
movups (IVP), IV
@@ -891,6 +891,7 @@ ENTRY(aesni_cbc_dec)
add $16, OUTP
cmp $16, LEN
jge .Lcbc_dec_loop1
- movups IV, (IVP)
.Lcbc_dec_ret:
+ movups IV, (IVP)
+.Lcbc_dec_just_ret:
ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af6549..c580c5ec1ca 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
#include <asm/i387.h>
#include <asm/aes.h>
+#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
+#define HAS_CTR
+#endif
+
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
+
+#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
+#define HAS_PCBC
+#endif
+
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
struct async_aes_ctx {
struct cryptd_ablkcipher *cryptd_tfm;
};
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
}
};
+static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ aesni_enc(ctx, dst, src);
+}
+
+static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ aesni_dec(ctx, dst, src);
+}
+
+static struct crypto_alg __aesni_alg = {
+ .cra_name = "__aes-aesni",
+ .cra_driver_name = "__driver-aes-aesni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = __aes_encrypt,
+ .cia_decrypt = __aes_decrypt
+ }
+ }
+};
+
static int ecb_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
@@ -147,6 +198,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
@@ -170,6 +222,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
@@ -215,6 +268,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
@@ -238,6 +292,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
@@ -277,8 +332,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
{
struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+ int err;
- return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len);
+ crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ablkcipher_setkey(child, key, key_len);
+ crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+ return err;
}
static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +474,163 @@ static struct crypto_alg ablk_cbc_alg = {
},
};
+#ifdef HAS_CTR
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
+ 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+ .cra_init = ablk_ctr_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ .geniv = "chainiv",
+ },
+ },
+};
+#endif
+
+#ifdef HAS_LRW
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
+ 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+ .cra_name = "lrw(aes)",
+ .cra_driver_name = "lrw-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+ .cra_init = ablk_lrw_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+#endif
+
+#ifdef HAS_PCBC
+static int ablk_pcbc_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
+ 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_pcbc_alg = {
+ .cra_name = "pcbc(aes)",
+ .cra_driver_name = "pcbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
+ .cra_init = ablk_pcbc_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+#endif
+
+#ifdef HAS_XTS
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
+ 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+ .cra_init = ablk_xts_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+#endif
+
static int __init aesni_init(void)
{
int err;
@@ -421,6 +641,8 @@ static int __init aesni_init(void)
}
if ((err = crypto_register_alg(&aesni_alg)))
goto aes_err;
+ if ((err = crypto_register_alg(&__aesni_alg)))
+ goto __aes_err;
if ((err = crypto_register_alg(&blk_ecb_alg)))
goto blk_ecb_err;
if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +651,41 @@ static int __init aesni_init(void)
goto ablk_ecb_err;
if ((err = crypto_register_alg(&ablk_cbc_alg)))
goto ablk_cbc_err;
+#ifdef HAS_CTR
+ if ((err = crypto_register_alg(&ablk_ctr_alg)))
+ goto ablk_ctr_err;
+#endif
+#ifdef HAS_LRW
+ if ((err = crypto_register_alg(&ablk_lrw_alg)))
+ goto ablk_lrw_err;
+#endif
+#ifdef HAS_PCBC
+ if ((err = crypto_register_alg(&ablk_pcbc_alg)))
+ goto ablk_pcbc_err;
+#endif
+#ifdef HAS_XTS
+ if ((err = crypto_register_alg(&ablk_xts_alg)))
+ goto ablk_xts_err;
+#endif
return err;
+#ifdef HAS_XTS
+ablk_xts_err:
+#endif
+#ifdef HAS_PCBC
+ crypto_unregister_alg(&ablk_pcbc_alg);
+ablk_pcbc_err:
+#endif
+#ifdef HAS_LRW
+ crypto_unregister_alg(&ablk_lrw_alg);
+ablk_lrw_err:
+#endif
+#ifdef HAS_CTR
+ crypto_unregister_alg(&ablk_ctr_alg);
+ablk_ctr_err:
+#endif
+ crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
@@ -439,6 +693,8 @@ ablk_ecb_err:
blk_cbc_err:
crypto_unregister_alg(&blk_ecb_alg);
blk_ecb_err:
+ crypto_unregister_alg(&__aesni_alg);
+__aes_err:
crypto_unregister_alg(&aesni_alg);
aes_err:
return err;
@@ -446,10 +702,23 @@ aes_err:
static void __exit aesni_exit(void)
{
+#ifdef HAS_XTS
+ crypto_unregister_alg(&ablk_xts_alg);
+#endif
+#ifdef HAS_PCBC
+ crypto_unregister_alg(&ablk_pcbc_alg);
+#endif
+#ifdef HAS_LRW
+ crypto_unregister_alg(&ablk_lrw_alg);
+#endif
+#ifdef HAS_CTR
+ crypto_unregister_alg(&ablk_ctr_alg);
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
crypto_unregister_alg(&ablk_ecb_alg);
crypto_unregister_alg(&blk_cbc_alg);
crypto_unregister_alg(&blk_ecb_alg);
+ crypto_unregister_alg(&__aesni_alg);
crypto_unregister_alg(&aesni_alg);
}
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 00000000000..daef6cd2b45
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
+/*
+ * FPU: Wrapper for blkcipher touching fpu
+ *
+ * Copyright (c) Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/i387.h>
+
+struct crypto_fpu_ctx {
+ struct crypto_blkcipher *child;
+};
+
+static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
+ unsigned int keylen)
+{
+ struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
+ struct crypto_blkcipher *child = ctx->child;
+ int err;
+
+ crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
+ CRYPTO_TFM_REQ_MASK);
+ err = crypto_blkcipher_setkey(child, key, keylen);
+ crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
+ CRYPTO_TFM_RES_MASK);
+ return err;
+}
+
+static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ int err;
+ struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+ struct crypto_blkcipher *child = ctx->child;
+ struct blkcipher_desc desc = {
+ .tfm = child,
+ .info = desc_in->info,
+ .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
+ };
+
+ kernel_fpu_begin();
+ err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
+ kernel_fpu_end();
+ return err;
+}
+
+static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ int err;
+ struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
+ struct crypto_blkcipher *child = ctx->child;
+ struct blkcipher_desc desc = {
+ .tfm = child,
+ .info = desc_in->info,
+ .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
+ };
+
+ kernel_fpu_begin();
+ err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
+ kernel_fpu_end();
+ return err;
+}
+
+static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
+{
+ struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+ struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+ struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct crypto_blkcipher *cipher;
+
+ cipher = crypto_spawn_blkcipher(spawn);
+ if (IS_ERR(cipher))
+ return PTR_ERR(cipher);
+
+ ctx->child = cipher;
+ return 0;
+}
+
+static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
+ crypto_free_blkcipher(ctx->child);
+}
+
+static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
+{
+ struct crypto_instance *inst;
+ struct crypto_alg *alg;
+ int err;
+
+ err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
+ if (err)
+ return ERR_PTR(err);
+
+ alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
+ CRYPTO_ALG_TYPE_MASK);
+ if (IS_ERR(alg))
+ return ERR_CAST(alg);
+
+ inst = crypto_alloc_instance("fpu", alg);
+ if (IS_ERR(inst))
+ goto out_put_alg;
+
+ inst->alg.cra_flags = alg->cra_flags;
+ inst->alg.cra_priority = alg->cra_priority;
+ inst->alg.cra_blocksize = alg->cra_blocksize;
+ inst->alg.cra_alignmask = alg->cra_alignmask;
+ inst->alg.cra_type = alg->cra_type;
+ inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
+ inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
+ inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
+ inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
+ inst->alg.cra_init = crypto_fpu_init_tfm;
+ inst->alg.cra_exit = crypto_fpu_exit_tfm;
+ inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
+ inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
+ inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
+
+out_put_alg:
+ crypto_mod_put(alg);
+ return inst;
+}
+
+static void crypto_fpu_free(struct crypto_instance *inst)
+{
+ crypto_drop_spawn(crypto_instance_ctx(inst));
+ kfree(inst);
+}
+
+static struct crypto_template crypto_fpu_tmpl = {
+ .name = "fpu",
+ .alloc = crypto_fpu_alloc,
+ .free = crypto_fpu_free,
+ .module = THIS_MODULE,
+};
+
+static int __init crypto_fpu_module_init(void)
+{
+ return crypto_register_template(&crypto_fpu_tmpl);
+}
+
+static void __exit crypto_fpu_module_exit(void)
+{
+ crypto_unregister_template(&crypto_fpu_tmpl);
+}
+
+module_init(crypto_fpu_module_init);
+module_exit(crypto_fpu_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e..e590261ba05 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
.quad compat_sys_signalfd4
.quad sys_eventfd2
.quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
+ .quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
.quad compat_sys_preadv
.quad compat_sys_pwritev
+ .quad compat_sys_rt_tgsigqueueinfo /* 335 */
+ .quad sys_perf_counter_open
ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc50090..20d1465a2ab 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,6 +144,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
#else /* !CONFIG_ACPI */
+#define acpi_disabled 1
#define acpi_lapic 0
#define acpi_ioapic 0
static inline void acpi_noirq_set(void) { }
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf7..1a37bcdc860 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
+#include <linux/stringify.h>
#include <asm/asm.h>
/*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
const unsigned char *const *find_nop_table(void);
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature) \
+ \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte " __stringify(feature) "\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement, \"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous"
+
/*
* Alternative instructions for different CPU types or capabilities.
*
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
* without volatile and memory clobber.
*/
#define alternative(oldinstr, newinstr, feature) \
- asm volatile ("661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte %c0\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement,\"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous" :: "i" (feature) : "memory")
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
/*
* Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
* Best is to use constraints that are fixed size (like (%1) ... "r")
* If you use variable sized constraints like "m" or "g" in the
* replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
*/
#define alternative_input(oldinstr, newinstr, feature, input...) \
- asm volatile ("661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte %c0\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement,\"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous" :: "i" (feature), ##input)
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, feature, output, input...) \
- asm volatile ("661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte %c[feat]\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement,\"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous" : output : [feat] "i" (feature), ##input)
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : output : "i" (0), ## input)
/*
* use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329b..bdf96f119f0 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,9 +27,13 @@ extern int amd_iommu_init(void);
extern int amd_iommu_init_dma_ops(void);
extern void amd_iommu_detect(void);
extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
+extern void amd_iommu_shutdown(void);
#else
static inline int amd_iommu_init(void) { return -ENODEV; }
static inline void amd_iommu_detect(void) { }
+static inline void amd_iommu_shutdown(void) { }
#endif
#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b..0c878caaa0a 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
domain for an IOMMU */
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...) \
+ do { \
+ if (amd_iommu_dump) \
+ printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
+ } while(0);
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+ list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+ list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
+#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
/*
* This structure contains generic data for IOMMU protection domains
@@ -210,6 +231,26 @@ struct protection_domain {
};
/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+ /* address allocation bitmap */
+ unsigned long *bitmap;
+
+ /*
+ * Array of PTE pages for the aperture. In this array we save all the
+ * leaf pages of the domain page table used for the aperture. This way
+ * we don't need to walk the page table to find a specific PTE. We can
+ * just calculate its address in constant time.
+ */
+ u64 *pte_pages[64];
+
+ unsigned long offset;
+};
+
+/*
* Data container for a dma_ops specific protection domain
*/
struct dma_ops_domain {
@@ -222,18 +263,10 @@ struct dma_ops_domain {
unsigned long aperture_size;
/* address we start to search for free addresses */
- unsigned long next_bit;
-
- /* address allocation bitmap */
- unsigned long *bitmap;
+ unsigned long next_address;
- /*
- * Array of PTE pages for the aperture. In this array we save all the
- * leaf pages of the domain page table used for the aperture. This way
- * we don't need to walk the page table to find a specific PTE. We can
- * just calculate its address in constant time.
- */
- u64 **pte_pages;
+ /* address space relevant data */
+ struct aperture_range *aperture[APERTURE_MAX_RANGES];
/* This will be set to true when TLB needs to be flushed */
bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f837742..bb7d4792584 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
extern void native_apic_icr_write(u32 low, u32 id);
extern u64 native_apic_icr_read(void);
-#define EIM_8BIT_APIC_ID 0
-#define EIM_32BIT_APIC_ID 1
+extern int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
/*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
return val;
}
-extern int x2apic, x2apic_phys;
+extern int x2apic_phys;
extern void check_x2apic(void);
extern void enable_x2apic(void);
-extern void enable_IR_x2apic(void);
extern void x2apic_icr_write(u32 low, u32 id);
static inline int x2apic_enabled(void)
{
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
return 1;
return 0;
}
+
+#define x2apic_supported() (cpu_has_x2apic)
#else
static inline void check_x2apic(void)
{
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
static inline void enable_x2apic(void)
{
}
-static inline void enable_IR_x2apic(void)
-{
-}
static inline int x2apic_enabled(void)
{
return 0;
}
-#define x2apic 0
-
+#define x2apic_preenabled 0
+#define x2apic_supported() 0
#endif
-extern int get_physical_broadcast(void);
+extern void enable_IR_x2apic(void);
-#ifdef CONFIG_X86_X2APIC
-static inline void ack_x2APIC_irq(void)
-{
- /* Docs say use 0 for future compatibility */
- native_apic_msr_write(APIC_EOI, 0);
-}
-#endif
+extern int get_physical_broadcast(void);
+extern void apic_disable(void);
extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
#define local_apic_timer_c2_ok 1
static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
-
+static inline void apic_disable(void) { }
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (APIC_XAPIC(ver))
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
return (x >> 24) & 0xFF;
else
return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
extern void default_setup_apic_routing(void);
#ifdef CONFIG_X86_32
+
+extern struct apic apic_default;
+
/*
* Set up the logical destination ID.
*
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514fb3b1..7ddb36ab933 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
# define APIC_INTEGRATED(x) (1)
#endif
#define APIC_XAPIC(x) ((x) >= 0x14)
+#define APIC_EXT_SPACE(x) ((x) & 0x80000000)
#define APIC_TASKPRI 0x80
#define APIC_TPRI_MASK 0xFFu
#define APIC_ARBPRI 0x90
@@ -116,7 +117,9 @@
#define APIC_TDR_DIV_32 0x8
#define APIC_TDR_DIV_64 0x9
#define APIC_TDR_DIV_128 0xA
-#define APIC_EILVT0 0x500
+#define APIC_EFEAT 0x400
+#define APIC_ECTRL 0x410
+#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
#define APIC_EILVT_MSG_NMI 0x4
#define APIC_EILVT_MSG_EXT 0x7
#define APIC_EILVT_MASKED (1 << 16)
-#define APIC_EILVT1 0x510
-#define APIC_EILVT2 0x520
-#define APIC_EILVT3 0x530
#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
#define APIC_BASE_MSR 0x800
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba422..dc5a667ff79 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -19,7 +19,10 @@
*
* Atomically reads the value of @v.
*/
-#define atomic_read(v) ((v)->counter)
+static inline int atomic_read(const atomic_t *v)
+{
+ return v->counter;
+}
/**
* atomic_set - set atomic variable
@@ -28,7 +31,10 @@
*
* Atomically sets the value of @v to @i.
*/
-#define atomic_set(v, i) (((v)->counter) = (i))
+static inline void atomic_set(atomic_t *v, int i)
+{
+ v->counter = i;
+}
/**
* atomic_add - add integer to atomic variable
@@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v)
return atomic_add_return(-i, v);
}
-#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
-#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
+static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline int atomic_xchg(atomic_t *v, int new)
+{
+ return xchg(&v->counter, new);
+}
/**
* atomic_add_unless - add unless the number is already a given value
@@ -247,5 +260,156 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
#define smp_mb__before_atomic_inc() barrier()
#define smp_mb__after_atomic_inc() barrier()
-#include <asm-generic/atomic.h>
+/* An 64bit atomic type */
+
+typedef struct {
+ u64 __aligned(8) counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+extern void atomic64_set(atomic64_t *ptr, u64 new_val);
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline u64 atomic64_read(atomic64_t *ptr)
+{
+ u64 res;
+
+ /*
+ * Note, we inline this atomic64_t primitive because
+ * it only clobbers EAX/EDX and leaves the others
+ * untouched. We also (somewhat subtly) rely on the
+ * fact that cmpxchg8b returns the current 64-bit value
+ * of the memory location we are touching:
+ */
+ asm volatile(
+ "mov %%ebx, %%eax\n\t"
+ "mov %%ecx, %%edx\n\t"
+ LOCK_PREFIX "cmpxchg8b %1\n"
+ : "=&A" (res)
+ : "m" (*ptr)
+ );
+
+ return res;
+}
+
+extern u64 atomic64_read(atomic64_t *ptr);
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
+
+/*
+ * Other variants with different arithmetic operators:
+ */
+extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
+extern u64 atomic64_inc_return(atomic64_t *ptr);
+extern u64 atomic64_dec_return(atomic64_t *ptr);
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+extern void atomic64_add(u64 delta, atomic64_t *ptr);
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+extern void atomic64_sub(u64 delta, atomic64_t *ptr);
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+extern void atomic64_inc(atomic64_t *ptr);
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+extern void atomic64_dec(atomic64_t *ptr);
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+extern int atomic64_dec_and_test(atomic64_t *ptr);
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+extern int atomic64_inc_and_test(atomic64_t *ptr);
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
+
+#include <asm-generic/atomic-long.h>
#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 8c21731984d..d605dc268e7 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -18,7 +18,10 @@
*
* Atomically reads the value of @v.
*/
-#define atomic_read(v) ((v)->counter)
+static inline int atomic_read(const atomic_t *v)
+{
+ return v->counter;
+}
/**
* atomic_set - set atomic variable
@@ -27,7 +30,10 @@
*
* Atomically sets the value of @v to @i.
*/
-#define atomic_set(v, i) (((v)->counter) = (i))
+static inline void atomic_set(atomic_t *v, int i)
+{
+ v->counter = i;
+}
/**
* atomic_add - add integer to atomic variable
@@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
* Atomically reads the value of @v.
* Doesn't imply a read memory barrier.
*/
-#define atomic64_read(v) ((v)->counter)
+static inline long atomic64_read(const atomic64_t *v)
+{
+ return v->counter;
+}
/**
* atomic64_set - set atomic64 variable
@@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
*
* Atomically sets the value of @v to @i.
*/
-#define atomic64_set(v, i) (((v)->counter) = (i))
+static inline void atomic64_set(atomic64_t *v, long i)
+{
+ v->counter = i;
+}
/**
* atomic64_add - add integer to atomic64 variable
@@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v)
#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
-#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
-#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline long atomic64_xchg(atomic64_t *v, long new)
+{
+ return xchg(&v->counter, new);
+}
-#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
-#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
+static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline long atomic_xchg(atomic_t *v, int new)
+{
+ return xchg(&v->counter, new);
+}
/**
* atomic_add_unless - add unless the number is a given value
@@ -455,5 +481,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
#define smp_mb__before_atomic_inc() barrier()
#define smp_mb__after_atomic_inc() barrier()
-#include <asm-generic/atomic.h>
+#include <asm-generic/atomic-long.h>
#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h
new file mode 100644
index 00000000000..b0ae1c4dc79
--- /dev/null
+++ b/arch/x86/include/asm/bitsperlong.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_X86_BITSPERLONG_H
+#define __ASM_X86_BITSPERLONG_H
+
+#ifdef __x86_64__
+# define __BITS_PER_LONG 64
+#else
+# define __BITS_PER_LONG 32
+#endif
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_X86_BITSPERLONG_H */
+
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc9..7a1065958ba 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
#ifdef __KERNEL__
+#include <asm/pgtable_types.h>
+
/* Physical address where kernel should be loaded. */
#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ (CONFIG_PHYSICAL_ALIGN - 1)) \
& ~(CONFIG_PHYSICAL_ALIGN - 1))
+/* Minimum kernel alignment, as a power of two */
+#ifdef CONFIG_X86_64
+#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
+#else
+#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER)
+#endif
+#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+ (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
#ifdef CONFIG_KERNEL_BZIP2
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b..1724e8de317 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
__u32 ramdisk_size;
__u32 bootsect_kludge;
__u16 heap_end_ptr;
- __u16 _pad1;
+ __u8 ext_loader_ver;
+ __u8 ext_loader_type;
__u32 cmd_line_ptr;
__u32 initrd_addr_max;
__u32 kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa..d96c1ee3a95 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
CPU_VALUE_BIT, /* value */
};
-#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
-
-/*
- * DisplayFamily_DisplayModel Processor Families/Processor Number Series
- * -------------------------- ------------------------------------------
- * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
- *
- * 06_01 Pentium Pro
- * 06_03, 06_05 Pentium II Xeon, Pentium II
- * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
- *
- * 06_09, 060D Pentium M
- *
- * 06_0E Core Duo, Core Solo
- *
- * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
- * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
- * Pentium dual-core
- * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
- *
- * 06_1C Atom
- *
- * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
- * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
- *
- * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
- * Pentium 4, Pentium D
- */
-
-/* Register processors bits */
-enum cpu_processor_bit {
- CPU_NONE,
-/* Intel */
- CPU_INTEL_PENTIUM_BIT,
- CPU_INTEL_P6_BIT,
- CPU_INTEL_PENTIUM_M_BIT,
- CPU_INTEL_CORE_BIT,
- CPU_INTEL_CORE2_BIT,
- CPU_INTEL_ATOM_BIT,
- CPU_INTEL_XEON_P4_BIT,
- CPU_INTEL_XEON_MP_BIT,
-/* AMD */
- CPU_AMD_K6_BIT,
- CPU_AMD_K7_BIT,
- CPU_AMD_K8_BIT,
- CPU_AMD_0F_BIT,
- CPU_AMD_10_BIT,
- CPU_AMD_11_BIT,
-};
-
-#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
-#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
-#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
-#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
-#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
-#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
-#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
-#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
-
-#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
-#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
-#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
-#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
-#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
-#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
-#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
-#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
-#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
-#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
-#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
-#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
-#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
-#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
-#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
-#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
-#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
-#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
-#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
-
-/* Select all supported Intel CPUs */
-#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
-
-#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
-#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
-#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
-#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
-#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
-#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
-
-#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
-#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
-#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
-#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
-
-/* Select all supported AMD CPUs */
-#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
-
-/* Select all supported CPUs */
-#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
+#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
#define MAX_CPU_FILES 512
@@ -220,7 +122,6 @@ struct cpu_debug_range {
unsigned min; /* Register range min */
unsigned max; /* Register range max */
unsigned flag; /* Supported flags */
- unsigned model; /* Supported models */
};
#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397a..4a28d22d479 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -115,6 +116,8 @@
#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
#define X86_FEATURE_AES (4*32+25) /* AES instructions */
#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
@@ -192,11 +195,11 @@ extern const char * const x86_power_flags[32];
#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
#define setup_clear_cpu_cap(bit) do { \
clear_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+ set_bit(bit, (unsigned long *)cpu_caps_cleared); \
} while (0)
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
- clear_bit(bit, (unsigned long *)cleared_cpu_caps); \
+ set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c45f415ce31..c993e9e0fed 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_DESC_H
#define _ASM_X86_DESC_H
-#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
@@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
}
-#else
-/*
- * GET_DESC_BASE reads the descriptor base of the specified segment.
- *
- * Args:
- * idx - descriptor index
- * gdt - GDT pointer
- * base - 32bit register to which the base will be written
- * lo_w - lo word of the "base" register
- * lo_b - lo byte of the "base" register
- * hi_b - hi byte of the low word of the "base" register
- *
- * Example:
- * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
- */
-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
- movb idx * 8 + 4(gdt), lo_b; \
- movb idx * 8 + 7(gdt), hi_b; \
- shll $16, base; \
- movw idx * 8 + 2(gdt), lo_w;
-
-
-#endif /* __ASSEMBLY__ */
-
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c6..1c3f9435f1c 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
* Documentation/DMA-API.txt for documentation.
*/
+#include <linux/kmemcheck.h>
#include <linux/scatterlist.h>
#include <linux/dma-debug.h>
#include <linux/dma-attrs.h>
@@ -32,6 +33,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
#endif
}
+#include <asm-generic/dma-mapping-common.h>
+
/* Make sure we keep the same behaviour */
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
@@ -52,171 +55,6 @@ extern int dma_set_mask(struct device *dev, u64 mask);
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag);
-static inline dma_addr_t
-dma_map_single(struct device *hwdev, void *ptr, size_t size,
- enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
- dma_addr_t addr;
-
- BUG_ON(!valid_dma_direction(dir));
- addr = ops->map_page(hwdev, virt_to_page(ptr),
- (unsigned long)ptr & ~PAGE_MASK, size,
- dir, NULL);
- debug_dma_map_page(hwdev, virt_to_page(ptr),
- (unsigned long)ptr & ~PAGE_MASK, size,
- dir, addr, true);
- return addr;
-}
-
-static inline void
-dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
- enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->unmap_page)
- ops->unmap_page(dev, addr, size, dir, NULL);
- debug_dma_unmap_page(dev, addr, size, dir, true);
-}
-
-static inline int
-dma_map_sg(struct device *hwdev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
- int ents;
-
- BUG_ON(!valid_dma_direction(dir));
- ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
- debug_dma_map_sg(hwdev, sg, nents, ents, dir);
-
- return ents;
-}
-
-static inline void
-dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- debug_dma_unmap_sg(hwdev, sg, nents, dir);
- if (ops->unmap_sg)
- ops->unmap_sg(hwdev, sg, nents, dir, NULL);
-}
-
-static inline void
-dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
- size_t size, enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_for_cpu)
- ops->sync_single_for_cpu(hwdev, dma_handle, size, dir);
- debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir);
- flush_write_buffers();
-}
-
-static inline void
-dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
- size_t size, enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_for_device)
- ops->sync_single_for_device(hwdev, dma_handle, size, dir);
- debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir);
- flush_write_buffers();
-}
-
-static inline void
-dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
- unsigned long offset, size_t size,
- enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_range_for_cpu)
- ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
- size, dir);
- debug_dma_sync_single_range_for_cpu(hwdev, dma_handle,
- offset, size, dir);
- flush_write_buffers();
-}
-
-static inline void
-dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
- unsigned long offset, size_t size,
- enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_range_for_device)
- ops->sync_single_range_for_device(hwdev, dma_handle,
- offset, size, dir);
- debug_dma_sync_single_range_for_device(hwdev, dma_handle,
- offset, size, dir);
- flush_write_buffers();
-}
-
-static inline void
-dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_sg_for_cpu)
- ops->sync_sg_for_cpu(hwdev, sg, nelems, dir);
- debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir);
- flush_write_buffers();
-}
-
-static inline void
-dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(hwdev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_sg_for_device)
- ops->sync_sg_for_device(hwdev, sg, nelems, dir);
- debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir);
-
- flush_write_buffers();
-}
-
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
- size_t offset, size_t size,
- enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
- dma_addr_t addr;
-
- BUG_ON(!valid_dma_direction(dir));
- addr = ops->map_page(dev, page, offset, size, dir, NULL);
- debug_dma_map_page(dev, page, offset, size, dir, addr, false);
-
- return addr;
-}
-
-static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (ops->unmap_page)
- ops->unmap_page(dev, addr, size, dir, NULL);
- debug_dma_unmap_page(dev, addr, size, dir, false);
-}
-
static inline void
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction dir)
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a8f672ba100..70dac199b09 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
* - buffer allocation (memory accounting)
*
*
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
*/
#ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
* The interrupt threshold is independent from the overflow callback
* to allow users to use their own overflow interrupt handling mechanism.
*
- * task: the task to request recording for;
- * NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu: the cpu to request recording for
* base: the base pointer for the (non-pageable) buffer;
* size: the size of the provided buffer in bytes
* ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
* -1 if no interrupt threshold is requested.
* flags: a bit-mask of the above flags
*/
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
/*
* Release BTS or PEBS resources
* Suspend and resume BTS or PEBS tracing
*
+ * Must be called with irq's enabled.
+ *
* tracer: the tracer handle returned from ds_request_~()
*/
extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
extern void ds_suspend_pebs(struct pebs_tracer *tracer);
extern void ds_resume_pebs(struct pebs_tracer *tracer);
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
/*
* The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
} lbr;
/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
struct {
- __u64 jiffies;
+ __u64 clock;
pid_t pid;
- } timestamp;
+ } event;
} variant;
};
@@ -201,8 +234,12 @@ struct bts_trace {
struct pebs_trace {
struct ds_trace ds;
- /* the PEBS reset value */
- unsigned long long reset_value;
+ /* the number of valid counters in the below array */
+ unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+ /* the counter reset value */
+ unsigned long long counter_reset[MAX_PEBS_COUNTERS];
};
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
* Returns 0 on success; -Eerrno on error
*
* tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
* value: the new counter reset value
*/
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+ unsigned int counter, u64 value);
/*
* Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
*/
extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
#else /* CONFIG_X86_DS */
struct cpuinfo_x86;
static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
static inline void ds_switch_to(struct task_struct *prev,
struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
- struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
#endif /* CONFIG_X86_DS */
#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index edc90f23e70..8406ed7f992 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
efi_call_virt(f, a1, a2, a3, a4, a5, a6)
-#define efi_ioremap(addr, size) ioremap_cache(addr, size)
+#define efi_ioremap(addr, size, type) ioremap_cache(addr, size)
#else /* !CONFIG_X86_32 */
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
-extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
+extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
+ u32 type);
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf25..ff8cbfa0785 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
smp_invalidate_interrupt)
@@ -49,11 +50,19 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
#ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
#endif
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_THERMAL_VECTOR
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
+#endif
+
+#ifdef CONFIG_X86_NEW_MCE
+BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
+#endif
+
#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 2d81af3974a..7b2d71df39a 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -111,12 +111,9 @@ enum fixed_addresses {
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
- FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
- FIX_TEXT_POKE1,
+ FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
__end_of_permanent_fixed_addresses,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
@@ -129,6 +126,9 @@ enum fixed_addresses {
FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
(__end_of_permanent_fixed_addresses & 255),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f98..82e3e8f0104 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
unsigned int irq_spurious_count;
#endif
unsigned int generic_irqs; /* arch dependent */
+ unsigned int apic_perf_irqs;
+ unsigned int apic_pending_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
@@ -20,7 +22,7 @@ typedef struct {
#endif
#ifdef CONFIG_X86_MCE
unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
unsigned int irq_threshold_count;
# endif
#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd7..ba180d93b08 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,9 +29,12 @@
extern void apic_timer_interrupt(void);
extern void generic_interrupt(void);
extern void error_interrupt(void);
+extern void perf_pending_interrupt(void);
+
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
+extern void mce_self_interrupt(void);
extern void invalidate_interrupt(void);
extern void invalidate_interrupt0(void);
@@ -44,6 +47,7 @@ extern void invalidate_interrupt6(void);
extern void invalidate_interrupt7(void);
extern void irq_move_cleanup_interrupt(void);
+extern void reboot_interrupt(void);
extern void threshold_interrupt(void);
extern void call_function_interrupt(void);
@@ -63,7 +67,26 @@ extern unsigned long io_apic_irqs;
extern void init_VISWS_APIC_irqs(void);
extern void setup_IO_APIC(void);
extern void disable_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+
+struct io_apic_irq_attr {
+ int ioapic;
+ int ioapic_pin;
+ int trigger;
+ int polarity;
+};
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+ int ioapic, int ioapic_pin,
+ int trigger, int polarity)
+{
+ irq_attr->ioapic = ioapic;
+ irq_attr->ioapic_pin = ioapic_pin;
+ irq_attr->trigger = trigger;
+ irq_attr->polarity = polarity;
+}
+
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
+ struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
extern void enable_IO_APIC(void);
@@ -78,7 +101,11 @@ extern void eisa_set_level_irq(unsigned int irq);
/* SMP */
extern void smp_apic_timer_interrupt(struct pt_regs *);
extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_generic_interrupt(struct pt_regs *);
extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_IO_APIC
+extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+#endif
#ifdef CONFIG_SMP
extern void smp_reschedule_interrupt(struct pt_regs *);
extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e518398..175adf58dd4 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
-#if 0 /* See comment in __save_init_fpu() below. */
+#if 0 /* See comment in fxsave() below. */
: [fx] "r" (fx), "m" (*fx), "0" (0));
#else
: [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return err;
}
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_XSAVE)
- return xrstor_checking(&tsk->thread.xstate->xsave);
- else
- return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
+#if 0 /* See comment in fxsave() below. */
: [fx] "r" (fx), "0" (0));
#else
: [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
asm volatile("fnclex ; fwait");
}
-static inline void restore_fpu(struct task_struct *tsk)
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
- if (task_thread_info(tsk)->status & TS_XSAVE) {
- xrstor_checking(&tsk->thread.xstate->xsave);
- return;
- }
/*
* The "nop" is needed to make the instructions the same
* length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
"nop ; frstor %1",
"fxrstor %1",
X86_FEATURE_FXSR,
- "m" (tsk->thread.xstate->fxsave));
+ "m" (*fx));
+
+ return 0;
}
/* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
#endif /* CONFIG_X86_64 */
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ return xrstor_checking(&tsk->thread.xstate->xsave);
+ else
+ return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
/*
* Signal frame handlers...
*/
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
/*
* Some instructions like VIA's padlock instructions generate a spurious
* DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context aswell. To prevent these kernel instructions
- * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
* should use them only in the context of irq_ts_save/restore()
*/
static inline int irq_ts_save(void)
{
/*
- * If we are in process context, we are ok to take a spurious DNA fault.
- * Otherwise, doing clts() in process context require pre-emption to
- * be disabled or some heavy lifting like kernel_fpu_begin()
+ * If in process context and not atomic, we can take a spurious DNA fault.
+ * Otherwise, doing clts() in process context requires disabling preemption
+ * or some heavy lifting like kernel_fpu_begin()
*/
- if (!in_interrupt())
+ if (!in_atomic())
return 0;
if (read_cr0() & X86_CR0_TS) {
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c092a..58d7091eeb1 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
extern void mask_8259A(void);
extern void unmask_8259A(void);
-#ifdef CONFIG_X86_32
-extern void init_ISA_irqs(void);
-#endif
-
#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2..00000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
- (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-union cpuid10_eax {
- struct {
- unsigned int version_id:8;
- unsigned int num_counters:8;
- unsigned int bit_width:8;
- unsigned int mask_length:8;
- } split;
- unsigned int full;
-};
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e43601..330ee807f89 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,22 +154,20 @@ extern int timer_through_8259;
extern int io_apic_get_unique_id(int ioapic, int apic_id);
extern int io_apic_get_version(int ioapic);
extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
- int edge_level, int active_high_low);
#endif /* CONFIG_ACPI */
+struct io_apic_irq_attr;
+extern int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr);
extern int (*ioapic_renumber_irq)(int ioapic, int irq);
extern void ioapic_init_mappings(void);
+extern void ioapic_insert_resources(void);
-#ifdef CONFIG_X86_64
extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
- struct IO_APIC_route_entry **ioapic_entries);
-#endif
extern void probe_nr_irqs_gsi(void);
@@ -183,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin,
#define io_apic_assign_pci_irqs 0
static const int timer_through_8259 = 0;
static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_insert_resources(void) { }
static inline void probe_nr_irqs_gsi(void) { }
#endif
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6..0e9fe1d9d97 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
/*
* Copyright © 2008 Ingo Molnar
*
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
void
iounmap_atomic(void *kvaddr, enum km_type type);
+
+#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index af326a2975b..fd6d21bbee6 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,7 @@ extern void no_iommu_init(void);
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
+extern int iommu_pass_through;
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760fccb..f275e224450 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
#ifndef _ASM_X86_IRQ_REMAPPING_H
#define _ASM_X86_IRQ_REMAPPING_H
-#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47..5b21f0ec3df 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
*/
#define NMI_VECTOR 0x02
+#define MCE_VECTOR 0x12
/*
* IDT vectors usable for external interrupt sources start
@@ -34,6 +35,7 @@
#ifdef CONFIG_X86_32
# define SYSCALL_VECTOR 0x80
+# define IA32_SYSCALL_VECTOR 0x80
#else
# define IA32_SYSCALL_VECTOR 0x80
#endif
@@ -86,13 +88,8 @@
#define CALL_FUNCTION_VECTOR 0xfc
#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
#define THERMAL_APIC_VECTOR 0xfa
-
-#ifdef CONFIG_X86_32
-/* 0xf8 - 0xf9 : free */
-#else
-# define THRESHOLD_APIC_VECTOR 0xf9
-# define UV_BAU_MESSAGE 0xf8
-#endif
+#define THRESHOLD_APIC_VECTOR 0xf9
+#define REBOOT_VECTOR 0xf8
/* f0-f7 used for spreading out TLB flushes: */
#define INVALIDATE_TLB_VECTOR_END 0xf7
@@ -107,14 +104,21 @@
#define LOCAL_TIMER_VECTOR 0xef
/*
- * Performance monitoring interrupt vector:
+ * Generic system vector for platform specific use
*/
-#define LOCAL_PERF_VECTOR 0xee
+#define GENERIC_INTERRUPT_VECTOR 0xed
/*
- * Generic system vector for platform specific use
+ * Performance monitoring pending work vector:
*/
-#define GENERIC_INTERRUPT_VECTOR 0xed
+#define LOCAL_PENDING_VECTOR 0xec
+
+#define UV_BAU_MESSAGE 0xec
+
+/*
+ * Self IPI vector for machine checks
+ */
+#define MCE_SELF_VECTOR 0xeb
/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2bdab21f089..c6ccbe7e81a 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void)
{
unsigned long flags;
+ /*
+ * Note: this needs to be "=r" not "=rm", because we have the
+ * stack offset from what gcc expects at the time the "pop" is
+ * executed, and so a memory reference with respect to the stack
+ * would end up using the wrong address.
+ */
asm volatile("# __raw_save_flags\n\t"
"pushf ; pop %0"
- : "=g" (flags)
+ : "=r" (flags)
: /* no input */
: "memory");
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24..c2d1f3b58e5 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
extern int k8_scan_nodes(unsigned long start, unsigned long end);
+#ifdef CONFIG_K8_NB
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+ return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+}
+#else
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+ return NULL;
+}
+#endif
+
+
#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5c..9e00a731a7f 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,28 +2,11 @@
#define _ASM_X86_KMAP_TYPES_H
#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
-# define D(n) __KM_FENCE_##n ,
-#else
-# define D(n)
+#define __WITH_KM_FENCE
#endif
-enum km_type {
-D(0) KM_BOUNCE_READ,
-D(1) KM_SKB_SUNRPC_DATA,
-D(2) KM_SKB_DATA_SOFTIRQ,
-D(3) KM_USER0,
-D(4) KM_USER1,
-D(5) KM_BIO_SRC_IRQ,
-D(6) KM_BIO_DST_IRQ,
-D(7) KM_PTE0,
-D(8) KM_PTE1,
-D(9) KM_IRQ0,
-D(10) KM_IRQ1,
-D(11) KM_SOFTIRQ0,
-D(12) KM_SOFTIRQ1,
-D(13) KM_TYPE_NR
-};
+#include <asm-generic/kmap_types.h>
-#undef D
+#undef __WITH_KM_FENCE
#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 00000000000..ed01518f297
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
+#ifndef ASM_X86_KMEMCHECK_H
+#define ASM_X86_KMEMCHECK_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_KMEMCHECK
+bool kmemcheck_active(struct pt_regs *regs);
+
+void kmemcheck_show(struct pt_regs *regs);
+void kmemcheck_hide(struct pt_regs *regs);
+
+bool kmemcheck_fault(struct pt_regs *regs,
+ unsigned long address, unsigned long error_code);
+bool kmemcheck_trap(struct pt_regs *regs);
+#else
+static inline bool kmemcheck_active(struct pt_regs *regs)
+{
+ return false;
+}
+
+static inline void kmemcheck_show(struct pt_regs *regs)
+{
+}
+
+static inline void kmemcheck_hide(struct pt_regs *regs)
+{
+}
+
+static inline bool kmemcheck_fault(struct pt_regs *regs,
+ unsigned long address, unsigned long error_code)
+{
+ return false;
+}
+
+static inline bool kmemcheck_trap(struct pt_regs *regs)
+{
+ return false;
+}
+#endif /* CONFIG_KMEMCHECK */
+
+#endif
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf1170..125be8b1956 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
#define __KVM_HAVE_MSI
#define __KVM_HAVE_USER_NMI
#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044f..eabdc1cfab5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
unsigned access:3;
unsigned invalid:1;
unsigned cr4_pge:1;
+ unsigned nxe:1;
};
};
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
bool unsync;
- bool global;
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
union kvm_mmu_page_role base_role;
u64 *pae_root;
+ u64 rsvd_bits_mask[2][4];
};
struct kvm_vcpu_arch {
u64 host_tsc;
- int interrupt_window_open;
- unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
- DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
u64 shadow_efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
+ int32_t apic_arb_prio;
int mp_state;
int sipi_vector;
u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
struct kvm_pio_request pio;
void *pio_data;
+ u8 event_exit_inst_len;
+
struct kvm_queued_exception {
bool pending;
bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
struct kvm_queued_interrupt {
bool pending;
+ bool soft;
u8 nr;
} interrupt;
struct {
- int active;
+ int vm86_active;
u8 save_iopl;
struct kvm_save_segment {
u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
unsigned int time_offset;
struct page *time_page;
+ bool singlestep; /* guest is single stepped by KVM */
bool nmi_pending;
bool nmi_injected;
- bool nmi_window_open;
struct mtrr_state_type mtrr_state;
u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
*/
struct list_head active_mmu_pages;
struct list_head assigned_dev_head;
- struct list_head oos_global_pages;
struct iommu_domain *iommu_domain;
+ int iommu_flags;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
int vapics_in_nmi_mode;
- int round_robin_prev_vcpu;
unsigned int tss_addr;
struct page *apic_access_page;
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 mmu_unsync;
- u32 mmu_unsync_global;
u32 remote_tlb_flush;
u32 lpages;
};
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
- u32 request_nmi_exits;
u32 irq_exits;
u32 host_state_reload;
u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
- int (*get_irq)(struct kvm_vcpu *vcpu);
- void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+ void (*set_irq)(struct kvm_vcpu *vcpu);
+ void (*set_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code);
- bool (*exception_injected)(struct kvm_vcpu *vcpu);
- void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
- void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
- struct kvm_run *run);
-
+ int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+ int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
- int (*get_mt_mask_shift)(void);
+ u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
@@ -563,6 +565,7 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
unsigned long cr2, u16 error_code, int emulation_type);
void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -769,6 +771,8 @@ enum {
#define HF_GIF_MASK (1 << 0)
#define HF_HIF_MASK (1 << 1)
#define HF_VINTR_MASK (1 << 2)
+#define HF_NMI_MASK (1 << 3)
+#define HF_IRET_MASK (1 << 4)
/*
* Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881..b7ed2c42311 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
struct fetch_cache fetch;
};
+#define X86_SHADOW_INT_MOV_SS 1
+#define X86_SHADOW_INT_STI 2
+
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
int mode;
u32 cs_base;
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
/* decode cache */
struct decode_cache decode;
};
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9..5136dad57cb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,12 @@
/* Pages for switcher itself, then two pages per cpu */
#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
#define SWITCHER_ADDR 0xFFC00000
+#endif
/* Found in switcher.S */
extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487..ba0eed8aa1a 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
#define LHCALL_TS 8
#define LHCALL_SET_CLOCKEVENT 9
#define LHCALL_HALT 10
+#define LHCALL_SET_PMD 13
#define LHCALL_SET_PTE 14
-#define LHCALL_SET_PMD 15
+#define LHCALL_SET_PGD 15
#define LHCALL_LOAD_TLS 16
#define LHCALL_NOTIFY 17
#define LHCALL_LOAD_GDT_ENTRY 18
+#define LHCALL_SEND_INTERRUPTS 19
#define LGUEST_TRAP_ENTRY 0x1F
@@ -28,27 +30,28 @@
#include <asm/hw_irq.h>
#include <asm/kvm_para.h>
-/*G:031 But first, how does our Guest contact the Host to ask for privileged
+/*G:030
+ * But first, how does our Guest contact the Host to ask for privileged
* operations? There are two ways: the direct way is to make a "hypercall",
* to make requests of the Host Itself.
*
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx. If a return
- * value makes sense, it's returned in %eax.
+ * We use the KVM hypercall mechanism, though completely different hypercall
+ * numbers. Seventeen hypercalls are available: the hypercall number is put in
+ * the %eax register, and the arguments (when required) are placed in %ebx,
+ * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
*
* Grossly invalid calls result in Sudden Death at the hands of the vengeful
* Host, rather than returning failure. This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally". */
-/*:*/
+ * definition of a gentleman: "someone who is only rude intentionally".
+:*/
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
#define LHCALL_RING_SIZE 64
struct hcall_args {
- /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
- unsigned long arg0, arg1, arg2, arg3;
+ /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
+ unsigned long arg0, arg1, arg2, arg3, arg4;
};
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e..5cdd8d100ec 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
#ifndef _ASM_X86_MCE_H
#define _ASM_X86_MCE_H
-#ifdef __x86_64__
-
#include <linux/types.h>
#include <asm/ioctls.h>
@@ -10,21 +8,35 @@
* Machine Check support for x86
*/
-#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
-#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
-#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
-
-#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
-#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
-#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */
-
-#define MCI_STATUS_VAL (1UL<<63) /* valid error */
-#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */
-#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */
-#define MCI_STATUS_EN (1UL<<60) /* error enabled */
-#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */
-#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */
-#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */
+#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
+#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
+#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
+#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
+#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
+#define MCG_EXT_CNT_SHIFT 16
+#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
+#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
+
+#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
+#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
+#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+
+#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
+#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
+#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
+#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
+#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
+#define MCI_STATUS_AR (1ULL<<55) /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF 0 /* segment offset */
+#define MCM_ADDR_LINEAR 1 /* linear address */
+#define MCM_ADDR_PHYS 2 /* physical address */
+#define MCM_ADDR_MEM 3 /* memory address */
+#define MCM_ADDR_GENERIC 7 /* generic */
/* Fields are zero when not available */
struct mce {
@@ -34,13 +46,19 @@ struct mce {
__u64 mcgstatus;
__u64 ip;
__u64 tsc; /* cpu time stamp counter */
- __u64 res1; /* for future extension */
- __u64 res2; /* dito. */
+ __u64 time; /* wall time_t when error was detected */
+ __u8 cpuvendor; /* cpu vendor as encoded in system.h */
+ __u8 pad1;
+ __u16 pad2;
+ __u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
- __u8 cpu; /* cpu that raised the error */
+ __u8 cpu; /* cpu number; obsolete; use extcpu now */
__u8 finished; /* entry is valid */
- __u32 pad;
+ __u32 extcpu; /* linux cpu number that detected the error */
+ __u32 socketid; /* CPU socket ID */
+ __u32 apicid; /* CPU initial apic ID */
+ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
};
/*
@@ -57,7 +75,7 @@ struct mce_log {
unsigned len; /* = MCE_LOG_LEN */
unsigned next;
unsigned flags;
- unsigned pad0;
+ unsigned recordlen; /* length of struct mce */
struct mce entry[MCE_LOG_LEN];
};
@@ -82,20 +100,41 @@ struct mce_log {
#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
-#endif /* __x86_64__ */
-
#ifdef __KERNEL__
-#ifdef CONFIG_X86_32
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+
extern int mce_disabled;
-#else /* CONFIG_X86_32 */
+extern int mce_p5_enabled;
-#include <asm/atomic.h>
+#ifdef CONFIG_X86_MCE
+void mcheck_init(struct cpuinfo_x86 *c);
+#else
+static inline void mcheck_init(struct cpuinfo_x86 *c) {}
+#endif
+
+#ifdef CONFIG_X86_OLD_MCE
+extern int nr_mce_banks;
+void amd_mcheck_init(struct cpuinfo_x86 *c);
+void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
+void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void enable_p5_mce(void) {}
+#endif
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, device_mce);
-extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+DECLARE_PER_CPU(struct sys_device, mce_dev);
/*
* To support more than 128 would need to escape the predefined
@@ -104,6 +143,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
#ifdef CONFIG_X86_MCE_INTEL
+extern int mce_cmci_disabled;
+extern int mce_ignore_ce;
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
@@ -123,14 +164,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
-extern int mce_available(struct cpuinfo_x86 *c);
+int mce_available(struct cpuinfo_x86 *c);
-void mce_log_therm_throt_event(__u64 status);
+DECLARE_PER_CPU(unsigned, mce_exception_count);
+DECLARE_PER_CPU(unsigned, mce_poll_count);
extern atomic_t mce_entry;
-extern void do_machine_check(struct pt_regs *, long);
-
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
@@ -139,19 +179,40 @@ enum mcp_flags {
MCP_UC = (1 << 1), /* log uncorrected errors */
MCP_DONTLOG = (1 << 2), /* only clear, don't log */
};
-extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
-extern int mce_notify_user(void);
+int mce_notify_irq(void);
+void mce_notify_process(void);
-#endif /* !CONFIG_X86_32 */
+DECLARE_PER_CPU(struct mce, injectm);
+extern struct file_operations mce_chrdev_ops;
-#ifdef CONFIG_X86_MCE
-extern void mcheck_init(struct cpuinfo_x86 *c);
-#else
-#define mcheck_init(c) do { } while (0)
-#endif
+/*
+ * Exception handler
+ */
+
+/* Call the installed machine check handler for this CPU setup. */
+extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+void do_machine_check(struct pt_regs *, long);
+
+/*
+ * Threshold handler
+ */
extern void (*mce_threshold_vector)(void);
+extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+/*
+ * Thermal handler
+ */
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
+
+#ifdef CONFIG_X86_NEW_MCE
+void mce_log_therm_throt_event(__u64 status);
+#else
+static inline void mce_log_therm_throt_event(__u64 status) {}
+#endif
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c..ef51b501e22 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
struct device;
+enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+
struct microcode_ops {
- int (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
- int (*request_microcode_fw) (int cpu, struct device *device);
+ enum ucode_state (*request_microcode_user) (int cpu,
+ const void __user *buf, size_t size);
- void (*apply_microcode) (int cpu);
+ enum ucode_state (*request_microcode_fw) (int cpu,
+ struct device *device);
- int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
void (*microcode_fini_cpu) (int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that
+ * the callbacks below run on a target cpu when they
+ * are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ int (*apply_microcode) (int cpu);
+ int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
};
struct ucode_cpu_info {
- struct cpu_signature cpu_sig;
- int valid;
- void *mc;
+ struct cpu_signature cpu_sig;
+ int valid;
+ void *mc;
};
extern struct ucode_cpu_info ucode_cpu_info[];
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 90bc4108a4f..751af2550ed 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -1,7 +1,7 @@
#ifndef _ASM_X86_MMAN_H
#define _ASM_X86_MMAN_H
-#include <asm-generic/mman.h>
+#include <asm-generic/mman-common.h>
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7fc8cd..e2a1bb6d71e 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
#ifdef CONFIG_X86_MPPARSE
extern void find_smp_config(void);
extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
#else
static inline void find_smp_config(void) { }
static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
#endif
void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
u32 gsi);
extern void mp_config_acpi_legacy_irqs(void);
-extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+struct device;
+extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
+ int active_high_low);
extern int acpi_probe_gsi(void);
#ifdef CONFIG_X86_IO_APIC
-extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity);
extern int mp_find_ioapic(int gsi);
extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#else
-static inline int
-mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity)
-{
- return 0;
-}
#endif
#else /* !CONFIG_ACPI: */
static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c16..6be7fc254b5 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
#define MSR_K8_SYSCFG 0xc0010010
-#define MSR_K8_HWCR 0xc0010015
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
@@ -208,7 +207,14 @@
#define MSR_IA32_THERM_CONTROL 0x0000019a
#define MSR_IA32_THERM_INTERRUPT 0x0000019b
+
+#define THERM_INT_LOW_ENABLE (1 << 0)
+#define THERM_INT_HIGH_ENABLE (1 << 1)
+
#define MSR_IA32_THERM_STATUS 0x0000019c
+
+#define THERM_STATUS_PROCHOT (1 << 0)
+
#define MSR_IA32_MISC_ENABLE 0x000001a0
/* MISC_ENABLE bits: architectural */
@@ -240,10 +246,6 @@
#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
-/* Intel Model 6 */
-#define MSR_P6_EVNTSEL0 0x00000186
-#define MSR_P6_EVNTSEL1 0x00000187
-
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x00000180
#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf624180..48ad9d29484 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -3,15 +3,23 @@
#include <asm/msr-index.h>
-#ifndef __ASSEMBLY__
-# include <linux/types.h>
-#endif
-
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
+#include <linux/types.h>
#include <asm/asm.h>
#include <asm/errno.h>
+#include <asm/cpumask.h>
+
+struct msr {
+ union {
+ struct {
+ u32 l;
+ u32 h;
+ };
+ u64 q;
+ };
+};
static inline unsigned long long native_read_tscp(unsigned int *aux)
{
@@ -216,6 +224,8 @@ do { \
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
#else /* CONFIG_SMP */
@@ -229,6 +239,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
wrmsr(msr_no, l, h);
return 0;
}
+static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+ struct msr *msrs)
+{
+ rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+}
+static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+ struct msr *msrs)
+{
+ wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+}
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
u32 *l, u32 *h)
{
@@ -241,6 +261,4 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
#endif /* CONFIG_SMP */
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
-
-
#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568df..c86e5ed4af5 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
* but since they are power of two we could use a
* cheaper way --cvg
*/
- return nmi_watchdog & 0x3;
+ return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
}
#endif
@@ -72,7 +72,6 @@ void lapic_watchdog_stop(void);
int lapic_watchdog_init(unsigned nmi_hz);
int lapic_wd_event(unsigned nmi_hz);
unsigned lapic_adjust_nmi_hz(unsigned hz);
-int lapic_watchdog_ok(void);
void disable_lapic_nmi_watchdog(void);
void enable_lapic_nmi_watchdog(void);
void stop_nmi(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cb..c4ae822e415 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
extern void numa_init_array(void);
extern int numa_off;
-extern void srat_reserve_add_area(int nodeid);
-extern int hotadd_percent;
-
extern s16 apicid_to_node[MAX_LOCAL_APIC];
extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
unsigned long end);
#ifdef CONFIG_NUMA
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
extern void __init init_cpu_to_node(void);
extern void __cpuinit numa_set_node(int cpu, int node);
extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 89ed9d70b0a..625c3f0e741 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
#endif /* __ASSEMBLY__ */
#include <asm-generic/memory_model.h>
-#include <asm-generic/page.h>
+#include <asm-generic/getorder.h>
#define __HAVE_ARCH_GATE_AREA 1
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a..6f1b7331313 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
extern int sysctl_legacy_va_layout;
extern void find_low_pfn_range(void);
-extern unsigned long init_memory_mapping(unsigned long start,
- unsigned long end);
-extern void initmem_init(unsigned long, unsigned long);
-extern void free_initmem(void);
extern void setup_bootmem_allocator(void);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b7024..7639dbf5d22 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,24 +32,16 @@
*/
#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
-#define __PHYSICAL_START CONFIG_PHYSICAL_START
-#define __KERNEL_ALIGN 0x200000
-
-/*
- * Make sure kernel is aligned to 2MB address. Catching it at compile
- * time is better. Change your config file and compile the kernel
- * for a 2MB aligned address (CONFIG_PHYSICAL_START)
- */
-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
-#endif
+#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
+ (CONFIG_PHYSICAL_ALIGN - 1)) & \
+ ~(CONFIG_PHYSICAL_ALIGN - 1))
#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define __PHYSICAL_MASK_SHIFT 46
-#define __VIRTUAL_MASK_SHIFT 48
+#define __VIRTUAL_MASK_SHIFT 47
/*
* Kernel image size is limited to 512 MB (see level2_kernel_pgt in
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
#define vmemmap ((struct page *)VMEMMAP_START)
-extern unsigned long init_memory_mapping(unsigned long start,
- unsigned long end);
-
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
-
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006a..6473f5ccff8 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a53da004e08..4fb37c8a083 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
struct tss_struct;
struct mm_struct;
struct desc_struct;
+struct task_struct;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
void (*swapgs)(void);
- struct pv_lazy_ops lazy_mode;
+ void (*start_context_switch)(struct task_struct *prev);
+ void (*end_context_switch)(struct task_struct *next);
};
struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
};
enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
-#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(struct task_struct *prev)
{
- PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+ PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
}
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(struct task_struct *next)
{
- PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+ PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
}
-void arch_flush_lazy_cpu_mode(void);
-
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b51a1e8b0ba..1ff685ca221 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void);
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG)
+#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
dma_addr_t ADDR_NAME;
@@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void);
/* generic pci stuff */
#include <asm-generic/pci.h>
+#define PCIBIOS_MAX_MEM_32 0xffffffff
#ifdef CONFIG_NUMA
/* Returns the node based on pci bus */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e60fd3e14bd..b399988eee3 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -121,6 +121,9 @@ extern int __init pcibios_init(void);
extern int __init pci_mmcfg_arch_init(void);
extern void __init pci_mmcfg_arch_free(void);
+extern struct acpi_mcfg_allocation *pci_mmcfg_config;
+extern int pci_mmcfg_config_num;
+
/*
* AMD Fam10h CPUs are buggy, and cannot access MMIO config space
* on their northbrige except through the * %eax register. As such, you MUST
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 02ecb30982a..103f1ddb0d8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -42,6 +42,7 @@
#else /* ...!ASSEMBLY */
+#include <linux/kernel.h>
#include <linux/stringify.h>
#ifdef CONFIG_SMP
@@ -155,6 +156,15 @@ do { \
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+void *pcpu_lpage_remapped(void *kaddr);
+#else
+static inline void *pcpu_lpage_remapped(void *kaddr)
+{
+ return NULL;
+}
+#endif
+
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 00000000000..fa64e401589
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,98 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC 8
+#define X86_PMC_MAX_FIXED 3
+
+#define X86_PMC_IDX_GENERIC 0
+#define X86_PMC_IDX_FIXED 32
+#define X86_PMC_IDX_MAX 64
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK 0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_edx {
+ struct {
+ unsigned int num_counters_fixed:4;
+ unsigned int reserved:28;
+ } split;
+ unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(void);
+
+#define PERF_COUNTER_INDEX_OFFSET 0
+
+#else
+static inline void init_hw_perf_counters(void) { }
+static inline void perf_counters_lapic_init(void) { }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd14c54ac71..0e8c2a0fd92 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte)
__free_page(pte);
}
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+ unsigned long address)
+{
+ ___pte_free_tlb(tlb, pte);
+}
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
free_page((unsigned long)pmd);
}
-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long adddress)
+{
+ ___pmd_free_tlb(tlb, pmd);
+}
#ifdef CONFIG_X86_PAE
extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
free_page((unsigned long)pud);
}
-extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long address)
+{
+ ___pud_free_tlb(tlb, pud);
+}
+
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc..16748077559 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -2,6 +2,7 @@
#define _ASM_X86_PGTABLE_H
#include <asm/page.h>
+#include <asm/e820.h>
#include <asm/pgtable_types.h>
@@ -81,6 +82,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
#define pte_val(x) native_pte_val(x)
#define __pte(x) native_make_pte(x)
+#define arch_end_context_switch(prev) do {} while(0)
+
#endif /* CONFIG_PARAVIRT */
/*
@@ -267,10 +270,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
-static inline int is_new_memtype_allowed(unsigned long flags,
- unsigned long new_flags)
+static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
+ unsigned long flags,
+ unsigned long new_flags)
{
/*
+ * PAT type is always WB for ISA. So no need to check.
+ */
+ if (is_ISA_range(paddr, paddr + size - 1))
+ return 1;
+
+ /*
* Certain new memtypes are not allowed with certain
* requested memtype:
* - request is uncached, return cannot be write-back
@@ -315,6 +325,11 @@ static inline int pte_present(pte_t a)
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
+static inline int pte_hidden(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_HIDDEN;
+}
+
static inline int pmd_present(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_PRESENT;
@@ -503,6 +518,8 @@ static inline int pgd_none(pgd_t pgd)
#ifndef __ASSEMBLY__
+extern int direct_gbpages;
+
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 31bd120cf2a..01fd9461d32 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,13 +49,17 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
#endif
#if defined(CONFIG_HIGHPTE)
+#define __KM_PTE \
+ (in_nmi() ? KM_NMI_PTE : \
+ in_irq() ? KM_IRQ_PTE : \
+ KM_PTE0)
#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \
pte_index((address)))
#define pte_offset_map_nested(dir, address) \
((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
#else
#define pte_offset_map(dir, address) \
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 2733fad45f9..5e67c153231 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
#endif
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#define MODULES_LEN (MODULES_VADDR - MODULES_END)
+
#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d501..c57a3011714 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
extern void paging_init(void);
-#endif /* !__ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%016lx).\n", \
__FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define update_mmu_cache(vma, address, pte) do { } while (0)
-extern int direct_gbpages;
-
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
@@ -171,10 +165,7 @@ extern void cleanup_highmap(void);
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
-#define kc_offset_to_vaddr(o) \
- (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
- ? ((o) | ~__VIRTUAL_MASK) \
- : (o))
+#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e038..766ea16fbbb 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START _AC(0xffffc20000000000, UL)
-#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define VMALLOC_START _AC(0xffffc90000000000, UL)
+#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffea0000000000, UL)
#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786..54cb697f490 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
-#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
#define __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_KMEMCHECK
+#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_HIDDEN (_AT(pteval_t, 0))
+#endif
+
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#else
@@ -273,7 +278,6 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern int nx_enabled;
-extern void set_nx(void);
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae709c..c7768269b1c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss;
-extern __u32 cleared_cpu_caps[NCAPINTS];
+extern __u32 cpu_caps_cleared[NCAPINTS];
+extern __u32 cpu_caps_set[NCAPINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep;
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
struct thread_struct {
/* Cached TLS descriptors: */
@@ -427,8 +425,12 @@ struct thread_struct {
unsigned short fsindex;
unsigned short gsindex;
#endif
+#ifdef CONFIG_X86_32
unsigned long ip;
+#endif
+#ifdef CONFIG_X86_64
unsigned long fs;
+#endif
unsigned long gs;
/* Hardware debugging registers: */
unsigned long debugreg0;
@@ -460,14 +462,8 @@ struct thread_struct {
unsigned io_bitmap_max;
/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
unsigned long debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+ /* Debug Store context; see asm/ds.h */
struct ds_context *ds_ctx;
-#endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
- unsigned int bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
};
static inline unsigned long native_get_debugreg(int regno)
@@ -795,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
return debugctlmsr;
}
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+ u64 debugctlmsr = 0;
+ u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+ debugctlmsr = val1 | ((u64)val2 << 32);
+
+ return debugctlmsr;
+}
+
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
@@ -804,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}
+static inline void update_debugctlmsr_on_cpu(int cpu,
+ unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+ (u32)((u64)debugctlmsr),
+ (u32)((u64)debugctlmsr >> 32));
+}
+
/*
* from system description table in BIOS. Mostly for MCA use, but
* others may find it useful:
@@ -814,6 +837,7 @@ extern unsigned int BIOS_revision;
/* Boot loader type from the setup header: */
extern int bootloader_type;
+extern int bootloader_version;
extern char ignore_fpu_irq;
@@ -874,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
- .fs = __KERNEL_PERCPU, \
}
/*
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 49fb3ecf3bb..621f56d7312 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -22,7 +22,14 @@ extern int reboot_force;
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
-#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))
-#define round_down(x, y) ((x) & ~((y) - 1))
+/*
+ * This looks more complex than it should be. But we need to
+ * get the type for the ~ right in round_down (it needs to be
+ * as wide as the result!), and we want to evaluate the macro
+ * arguments just once each.
+ */
+#define __round_mask(x,y) ((__typeof__(x))((y)-1))
+#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
+#define round_down(x,y) ((x) & ~__round_mask(x,y))
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 624f133943e..0f0d908349a 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
extern int do_set_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info, int can_allocate);
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
- unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd5..64cf2d24fad 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
#endif
#ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
#define NEED_PSE 0
-#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
#define NEED_PGE 0
+#else
+#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada05ae..4093d1ed6db 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
int (*setup_ioapic_ids)(void);
};
-extern void x86_quirk_pre_intr_init(void);
extern void x86_quirk_intr_init(void);
extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 7761a5d554b..598457cbd0f 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -117,7 +117,7 @@ typedef unsigned long sigset_t;
#define MINSIGSTKSZ 2048
#define SIGSTKSZ 8192
-#include <asm-generic/signal.h>
+#include <asm-generic/signal-defs.h>
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88b966..6a84ed166ae 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
static inline int logical_smp_processor_id(void)
{
/* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+ return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
}
#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec..4517d6b9318 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
+# define MAX_PHYSMEM_BITS 46
#endif
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b7e5db87639..4e77853321d 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
#define _raw_read_relax(lock) cpu_relax()
#define _raw_write_relax(lock) cpu_relax()
+/* The {read|write|spin}_lock() on x86 are full memory barriers. */
+static inline void smp_mb__after_lock(void) { }
+#define ARCH_HAS_SMP_MB_AFTER_LOCK
+
#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f517944b2b1..cf86a5e7381 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,6 +3,8 @@
extern int kstack_depth_to_print;
+int x86_is_stack_id(int id, char *name);
+
/* Generic stack tracer with callbacks */
struct stacktrace_ops {
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f..c86f452256d 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
* No 3D Now!
*/
+#ifndef CONFIG_KMEMCHECK
#define memcpy(t, f, n) \
(__builtin_constant_p((n)) \
? __constant_memcpy((t), (f), (n)) \
: __memcpy((t), (f), (n)))
+#else
+/*
+ * kmemcheck becomes very happy if we use the REP instructions unconditionally,
+ * because it means that we know both memory operands in advance.
+ */
+#define memcpy(t, f, n) __memcpy((t), (f), (n))
+#endif
#endif
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e..19e2c468fc2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
function. */
#define __HAVE_ARCH_MEMCPY 1
+#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
extern void *memcpy(void *to, const void *from, size_t len);
#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
__ret; \
})
#endif
+#else
+/*
+ * kmemcheck becomes very happy if we use the REP instructions unconditionally,
+ * because it means that we know both memory operands in advance.
+ */
+#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
+#endif
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3eb..85574b7c1bc 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EVTINJ_VALID_ERR (1 << 11)
#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f690..372b76edd63 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
/*
* syscalls.h - Linux syscall interfaces (arch-specific)
*
- * Copyright (c) 2008 Jaswinder Singh
+ * Copyright (c) 2008 Jaswinder Singh Rajput
*
* This file is released under the GPLv2.
* See the file COPYING for more details.
@@ -12,50 +12,55 @@
#include <linux/compiler.h>
#include <linux/linkage.h>
-#include <linux/types.h>
#include <linux/signal.h>
+#include <linux/types.h>
/* Common in X86_32 and X86_64 */
/* kernel/ioport.c */
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+/* kernel/process.c */
+int sys_fork(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+
/* kernel/ldt.c */
asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+/* kernel/signal.c */
+long sys_rt_sigreturn(struct pt_regs *);
+
/* kernel/tls.c */
asmlinkage int sys_set_thread_area(struct user_desc __user *);
asmlinkage int sys_get_thread_area(struct user_desc __user *);
/* X86_32 only */
#ifdef CONFIG_X86_32
+/* kernel/ioport.c */
+long sys_iopl(struct pt_regs *);
+
/* kernel/process_32.c */
-int sys_fork(struct pt_regs *);
int sys_clone(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
int sys_execve(struct pt_regs *);
-/* kernel/signal_32.c */
+/* kernel/signal.c */
asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
struct old_sigaction __user *);
int sys_sigaltstack(struct pt_regs *);
unsigned long sys_sigreturn(struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
/* kernel/sys_i386_32.c */
+struct mmap_arg_struct;
+struct sel_arg_struct;
+struct oldold_utsname;
+struct old_utsname;
+
asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
-struct mmap_arg_struct;
asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-struct sel_arg_struct;
asmlinkage int old_select(struct sel_arg_struct __user *);
asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-struct old_utsname;
asmlinkage int sys_uname(struct old_utsname __user *);
-struct oldold_utsname;
asmlinkage int sys_olduname(struct oldold_utsname __user *);
/* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
#else /* CONFIG_X86_32 */
/* X86_64 only */
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
/* kernel/process_64.c */
-asmlinkage long sys_fork(struct pt_regs *);
asmlinkage long sys_clone(unsigned long, unsigned long,
void __user *, void __user *,
struct pt_regs *);
-asmlinkage long sys_vfork(struct pt_regs *);
asmlinkage long sys_execve(char __user *, char __user * __user *,
char __user * __user *,
struct pt_regs *);
long sys_arch_prctl(int, unsigned long);
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/signal_64.c */
+/* kernel/signal.c */
asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
/* kernel/sys_x86_64.c */
+struct new_utsname;
+
asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
-struct new_utsname;
asmlinkage long sys_uname(struct new_utsname __user *);
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c4..c4ee8056bac 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+ get_user(termios->c_line, &termio->c_line);
return copy_from_user(termios->c_cc, termio->c_cc, NCC);
}
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
deleted file mode 100644
index c62349ee786..00000000000
--- a/arch/x86/include/asm/therm_throt.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _ASM_X86_THERM_THROT_H
-#define _ASM_X86_THERM_THROT_H
-
-#include <asm/atomic.h>
-
-extern atomic_t therm_throt_en;
-int therm_throt_process(int curr);
-
-#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae09..fad7d40b75f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.flags = 0, \
.cpu = 0, \
- .preempt_count = 1, \
+ .preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
.restart_block = { \
.fn = do_no_restart_syscall, \
@@ -94,7 +94,8 @@ struct thread_info {
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
-#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */
+#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
+#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
/* work to do in syscall_trace_enter() */
@@ -152,9 +154,9 @@ struct thread_info {
/* thread information allocation */
#ifdef CONFIG_DEBUG_STACK_USAGE
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
#else
-#define THREAD_FLAGS GFP_KERNEL
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
#endif
#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index bd37ed444a2..20ca9c4d468 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -45,12 +45,16 @@ extern int no_timer_check;
*/
DECLARE_PER_CPU(unsigned long, cyc2ns);
+DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
- return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
+ int cpu = smp_processor_id();
+ unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
+ ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
+ return ns;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index b5c9d45c981..1375cfc9396 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -4,9 +4,7 @@
#include <asm/processor.h>
#include <asm/tsc.h>
-/* The PIT ticks at this frequency (in HZ): */
-#define PIT_TICK_RATE 1193182
-
+/* Assume we use the PIT time source for the clock tick */
#define CLOCK_TICK_RATE PIT_TICK_RATE
#define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 16a5c84b032..7f3eba08e7d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
static inline void __native_flush_tlb(void)
{
- write_cr3(read_cr3());
+ native_write_cr3(native_read_cr3());
}
static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = read_cr4();
+ cr4 = native_read_cr4();
/* clear PGE */
- write_cr4(cr4 & ~X86_CR4_PGE);
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
- write_cr4(cr4);
+ native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}
-extern void zap_low_mappings(void);
+extern void zap_low_mappings(bool early);
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca4..066ef590d7e 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
void x86_pci_root_bus_res_quirks(struct pci_bus *b);
#ifdef CONFIG_SMP
-#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
+ (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
#define smt_capable() (smp_num_siblings > 1)
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b8..bfd74c032fc 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
#define _ASM_X86_TRAPS_H
#include <asm/debugreg.h>
+#include <asm/siginfo.h> /* TRAP_TRACE, ... */
#ifdef CONFIG_X86_32
#define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_stack_segment(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
}
extern int panic_on_unrecovered_nmi;
-extern int kstack_depth_to_print;
void math_error(void __user *);
void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index e6f73632007..09b97745772 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -14,12 +14,6 @@ typedef unsigned short umode_t;
*/
#ifdef __KERNEL__
-#ifdef CONFIG_X86_32
-# define BITS_PER_LONG 32
-#else
-# define BITS_PER_LONG 64
-#endif
-
#ifndef __ASSEMBLY__
typedef u64 dma64_addr_t;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b685ece89d5..d2c6c930b49 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -25,7 +25,7 @@
#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
#define KERNEL_DS MAKE_MM_SEG(-1UL)
-#define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
+#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
#define get_ds() (KERNEL_DS)
#define get_fs() (current_thread_info()->addr_limit)
@@ -212,9 +212,9 @@ extern int __get_user_bad(void);
: "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#else
#define __put_user_asm_u64(x, ptr, retval, errret) \
- __put_user_asm(x, ptr, retval, "q", "", "Zr", errret)
+ __put_user_asm(x, ptr, retval, "q", "", "er", errret)
#define __put_user_asm_ex_u64(x, addr) \
- __put_user_asm_ex(x, addr, "q", "", "Zr")
+ __put_user_asm_ex(x, addr, "q", "", "er")
#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
#endif
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc687326eb..db24b215fc5 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
ret, "l", "k", "ir", 4);
return ret;
case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
- ret, "q", "", "ir", 8);
+ ret, "q", "", "er", 8);
return ret;
case 10:
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
- ret, "q", "", "ir", 10);
+ ret, "q", "", "er", 10);
if (unlikely(ret))
return ret;
asm("":::"memory");
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
return ret;
case 16:
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
- ret, "q", "", "ir", 16);
+ ret, "q", "", "er", 16);
if (unlikely(ret))
return ret;
asm("":::"memory");
__put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
- ret, "q", "", "ir", 8);
+ ret, "q", "", "er", 8);
return ret;
default:
return copy_user_generic((__force void *)dst, src, size);
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
ret, "q", "", "=r", 8);
if (likely(!ret))
__put_user_asm(tmp, (u64 __user *)dst,
- ret, "q", "", "ir", 8);
+ ret, "q", "", "er", 8);
return ret;
}
default:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8d..732a3070615 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
#define __NR_inotify_init1 332
#define __NR_preadv 333
#define __NR_pwritev 334
+#define __NR_rt_tgsigqueueinfo 335
+#define __NR_perf_counter_open 336
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f8182946232..900e1617e67 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
__SYSCALL(__NR_preadv, sys_preadv)
#define __NR_pwritev 296
__SYSCALL(__NR_pwritev, sys_pwritev)
-
+#define __NR_rt_tgsigqueueinfo 297
+__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
+#define __NR_perf_counter_open 298
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a8..80e2984f521 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
#define UV_CPUS_PER_ACT_STATUS 32
#define UV_ACT_STATUS_MASK 0x3
#define UV_ACT_STATUS_SIZE 2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
+#define UV_ADP_SIZE 32
#define UV_DISTRIBUTION_SIZE 256
#define UV_SW_ACK_NPENDING 8
#define UV_NET_ENDPOINT_INTD 0x38
@@ -133,7 +133,7 @@ struct bau_msg_payload {
* see table 4.2.3.0.1 in broacast_assist spec.
*/
struct bau_msg_header {
- unsigned int dest_subnodeid:6; /* must be zero */
+ unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
/* bits 5:0 */
unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
/* bits 20:6 */ /* first bit in node_map */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062..77a68505419 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
struct uv_hub_info_s {
unsigned long global_mmr_base;
unsigned long gpa_mask;
+ unsigned int gnode_extra;
unsigned long gnode_upper;
unsigned long lowmem_remap_top;
unsigned long lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
* p - PNODE (local part of nsids, right shifted 1)
*/
#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper)
+#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
#define UV_LOCAL_MMR_BASE 0xf4000000UL
#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
- ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+ (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
#define UV_APIC_PNODE_SHIFT 6
@@ -325,6 +327,7 @@ struct uv_blade_info {
unsigned short nr_possible_cpus;
unsigned short nr_online_cpus;
unsigned short pnode;
+ short memory_nid;
};
extern struct uv_blade_info *uv_blade_info;
extern short *uv_node_to_blade;
@@ -361,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid)
return uv_blade_info[bid].pnode;
}
+/* Nid of memory node on blade. -1 if no blade-local memory */
+static inline int uv_blade_to_memory_nid(int bid)
+{
+ return uv_blade_info[bid].memory_nid;
+}
+
/* Determine the number of possible cpus on a blade */
static inline int uv_blade_nr_possible_cpus(int bid)
{
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b..11be5ad2e0e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_EPT_VIOLATION 48
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17..7fcf6f3dbcc 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
+#ifdef CONFIG_KMEMCHECK
+/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
+# include <asm-generic/xor.h>
+#else
#ifdef CONFIG_X86_32
# include "xor_32.h"
#else
# include "xor_64.h"
#endif
+#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d1bfc847d..430d5b24af7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,11 +24,15 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
+GCOV_PROFILE_vsyscall_64.o := n
+GCOV_PROFILE_hpet.o := n
+GCOV_PROFILE_tsc.o := n
+GCOV_PROFILE_paravirt.o := n
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o
+obj-y += setup.o i8259.o irqinit.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -44,6 +48,7 @@ obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
obj-$(CONFIG_X86_DS) += ds.o
+obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
@@ -72,7 +77,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_MODULES) += module_$(BITS).o
+obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f80..6b8ca3a0285 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
+#include <linux/pci.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
@@ -43,11 +44,7 @@
static int __initdata acpi_force = 0;
u32 acpi_rsdt_forced;
-#ifdef CONFIG_ACPI
-int acpi_disabled = 0;
-#else
-int acpi_disabled = 1;
-#endif
+int acpi_disabled;
EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
@@ -121,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
early_iounmap(map, size);
}
-#ifdef CONFIG_PCI_MMCONFIG
-
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
-/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
-struct acpi_mcfg_allocation *pci_mmcfg_config;
-int pci_mmcfg_config_num;
-
-static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
-{
- if (!strcmp(mcfg->header.oem_id, "SGI"))
- acpi_mcfg_64bit_base_addr = TRUE;
-
- return 0;
-}
-
-int __init acpi_parse_mcfg(struct acpi_table_header *header)
-{
- struct acpi_table_mcfg *mcfg;
- unsigned long i;
- int config_size;
-
- if (!header)
- return -EINVAL;
-
- mcfg = (struct acpi_table_mcfg *)header;
-
- /* how many config structures do we have */
- pci_mmcfg_config_num = 0;
- i = header->length - sizeof(struct acpi_table_mcfg);
- while (i >= sizeof(struct acpi_mcfg_allocation)) {
- ++pci_mmcfg_config_num;
- i -= sizeof(struct acpi_mcfg_allocation);
- };
- if (pci_mmcfg_config_num == 0) {
- printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
- return -ENODEV;
- }
-
- config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
- pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
- if (!pci_mmcfg_config) {
- printk(KERN_WARNING PREFIX
- "No memory for MCFG config tables\n");
- return -ENOMEM;
- }
-
- memcpy(pci_mmcfg_config, &mcfg[1], config_size);
-
- acpi_mcfg_oem_check(mcfg);
-
- for (i = 0; i < pci_mmcfg_config_num; ++i) {
- if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
- !acpi_mcfg_64bit_base_addr) {
- printk(KERN_ERR PREFIX
- "MMCONFIG not in low 4GB of memory\n");
- kfree(pci_mmcfg_config);
- pci_mmcfg_config_num = 0;
- return -ENODEV;
- }
- }
-
- return 0;
-}
-#endif /* CONFIG_PCI_MMCONFIG */
-
#ifdef CONFIG_X86_LOCAL_APIC
static int __init acpi_parse_madt(struct acpi_table_header *table)
{
@@ -522,7 +453,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
* success: return IRQ number (>=0)
* failure: return < 0
*/
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
unsigned int irq;
unsigned int plat_gsi = gsi;
@@ -532,14 +463,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (triggering == ACPI_LEVEL_SENSITIVE)
+ if (trigger == ACPI_LEVEL_SENSITIVE)
eisa_set_level_irq(gsi);
}
#endif
#ifdef CONFIG_X86_IO_APIC
if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+ plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +834,8 @@ extern int es7000_plat;
#endif
static struct {
- int apic_id;
int gsi_base;
int gsi_end;
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
} mp_ioapic_routing[MAX_IO_APICS];
int mp_find_ioapic(int gsi)
@@ -986,16 +915,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].apicver = 0;
-#endif
+
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
mp_ioapic_routing[idx].gsi_base = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
@@ -1158,26 +1083,52 @@ void __init mp_config_acpi_legacy_irqs(void)
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
int ioapic;
- int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ u8 pin;
- static int pci_irq = IRQ_COMPRESSION_START;
- /*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
- */
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev)
+ return 0;
+ if (dev->bus != &pci_bus_type)
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* print the entry should happen on mptable identically */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ save_mp_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+ struct io_apic_irq_attr irq_attr;
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
-#endif
/* Don't set up the ACPI SCI because it's already set up */
if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1147,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
gsi = ioapic_renumber_irq(ioapic, gsi);
#endif
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ "%d-%d\n", mp_ioapics[ioapic].apicid,
ioapic_pin);
return gsi;
}
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
- }
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
- /*
- * For GSI >= 64, use IRQ compression
- */
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
-}
+ if (enable_update_mptable)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- int ioapic;
-
- if (!acpi_ioapic)
- return 0;
+ set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+ trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ io_apic_set_pci_routing(dev, gsi, &irq_attr);
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- save_mp_irq(&mp_irq);
-#endif
- return 0;
+ return gsi;
}
/*
@@ -1569,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
},
{
.callback = force_acpi_ht,
- .ident = "ASUS P4B266",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
- },
- },
- {
- .callback = force_acpi_ht,
.ident = "ASUS P2B-DS",
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index bbbe4bbb6f3..8c44c232efc 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
flags->bm_check = 1;
else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
- * Today all CPUs that support C3 share cache.
- * TBD: This needs to look at cache shared map, once
- * multi-core detection patch makes to the base.
+ * Today all MP CPUs that support C3 share cache.
+ * And caches should not be flushed by software while
+ * entering C3 type state.
*/
flags->bm_check = 1;
}
+
+ /*
+ * On all recent Intel platforms, ARB_DISABLE is a nop.
+ * So, set bm_control to zero to indicate that ARB_DISABLE
+ * is not required while entering C3 type state on
+ * P4, Core and beyond CPUs
+ */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
+ flags->bm_control = 0;
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index 7c074eec39f..d296f4a195c 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
return;
}
+
/* Initialize _PDC data based on the CPU vendor */
void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
{
@@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
}
EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
+
+void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
+{
+ if (pr->pdc) {
+ kfree(pr->pdc->pointer->buffer.pointer);
+ kfree(pr->pdc->pointer);
+ kfree(pr->pdc);
+ pr->pdc = NULL;
+ }
+}
+
+EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9de..6a564ac67ef 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
always := wakeup.bin
targets := wakeup.elf wakeup.lds
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
# The link order of the video-*.o modules can matter. In particular,
# video-vga.o *must* be listed first, followed by video-vesa.o.
@@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
$(call cc-option, -mpreferred-stack-boundary=2)
KBUILD_CFLAGS += $(call cc-option, -m32)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 00000000000..f51eb0bb56c
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 00000000000..6206033ba20
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7c243a2c511..ca93638ba43 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void)
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad5..6c99f503780 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
static struct dma_ops_domain *find_protection_domain(u16 devid);
+static u64* alloc_pte(struct protection_domain *dom,
+ unsigned long address, u64
+ **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages);
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
#ifdef CONFIG_AMD_IOMMU_STATS
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
struct amd_iommu *iommu;
- list_for_each_entry(iommu, &amd_iommu_list, list)
+ for_each_iommu(iommu)
iommu_poll_events(iommu);
return IRQ_HANDLED;
@@ -425,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
}
+/* Flush the whole IO/TLB for a given protection domain - including PDE */
+static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
+{
+ u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+ INC_STATS_COUNTER(domain_flush_single);
+
+ iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
+}
+
/*
* This function is used to flush the IO/TLB for a given protection domain
* on every IOMMU in the system
@@ -440,7 +459,7 @@ static void iommu_flush_domain(u16 domid)
__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
domid, 1, 1);
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
spin_lock_irqsave(&iommu->lock, flags);
__iommu_queue_command(iommu, &cmd);
__iommu_completion_wait(iommu);
@@ -449,6 +468,35 @@ static void iommu_flush_domain(u16 domid)
}
}
+void amd_iommu_flush_all_domains(void)
+{
+ int i;
+
+ for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+ if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ continue;
+ iommu_flush_domain(i);
+ }
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ struct amd_iommu *iommu;
+ int i;
+
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (amd_iommu_pd_table[i] == NULL)
+ continue;
+
+ iommu = amd_iommu_rlookup_table[i];
+ if (!iommu)
+ continue;
+
+ iommu_queue_inv_dev_entry(iommu, i);
+ iommu_completion_wait(iommu);
+ }
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -468,7 +516,7 @@ static int iommu_map_page(struct protection_domain *dom,
unsigned long phys_addr,
int prot)
{
- u64 __pte, *pte, *page;
+ u64 __pte, *pte;
bus_addr = PAGE_ALIGN(bus_addr);
phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +525,7 @@ static int iommu_map_page(struct protection_domain *dom,
if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- *pte = IOMMU_L2_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- *pte = IOMMU_L1_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+ pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
if (IOMMU_PTE_PRESENT(*pte))
return -EBUSY;
@@ -595,7 +623,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
* as allocated in the aperture
*/
if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+ __set_bit(addr >> PAGE_SHIFT,
+ dma_dom->aperture[0]->bitmap);
}
return 0;
@@ -632,42 +661,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
****************************************************************************/
/*
- * The address allocator core function.
+ * The address allocator core functions.
*
* called with domain->lock held
*/
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+ unsigned long address)
+{
+ u64 *pte;
+
+ pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+ return pte;
+}
+
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ */
+static int alloc_new_range(struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ bool populate, gfp_t gfp)
+{
+ int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ int i;
+
+#ifdef CONFIG_IOMMU_STRESS
+ populate = false;
+#endif
+
+ if (index >= APERTURE_MAX_RANGES)
+ return -ENOMEM;
+
+ dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+ if (!dma_dom->aperture[index])
+ return -ENOMEM;
+
+ dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+ if (!dma_dom->aperture[index]->bitmap)
+ goto out_free;
+
+ dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+ if (populate) {
+ unsigned long address = dma_dom->aperture_size;
+ int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+ u64 *pte, *pte_page;
+
+ for (i = 0; i < num_ptes; ++i) {
+ pte = alloc_pte(&dma_dom->domain, address,
+ &pte_page, gfp);
+ if (!pte)
+ goto out_free;
+
+ dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+ address += APERTURE_RANGE_SIZE / 64;
+ }
+ }
+
+ dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+ /* Intialize the exclusion range if necessary */
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+ iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ int pages = iommu_num_pages(iommu->exclusion_start,
+ iommu->exclusion_length,
+ PAGE_SIZE);
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
+
+ /*
+ * Check for areas already mapped as present in the new aperture
+ * range and mark those pages as reserved in the allocator. Such
+ * mappings may already exist as a result of requested unity
+ * mappings for devices.
+ */
+ for (i = dma_dom->aperture[index]->offset;
+ i < dma_dom->aperture_size;
+ i += PAGE_SIZE) {
+ u64 *pte = fetch_pte(&dma_dom->domain, i);
+ if (!pte || !IOMMU_PTE_PRESENT(*pte))
+ continue;
+
+ dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+ }
+
+ return 0;
+
+out_free:
+ free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+ kfree(dma_dom->aperture[index]);
+ dma_dom->aperture[index] = NULL;
+
+ return -ENOMEM;
+}
+
+static unsigned long dma_ops_area_alloc(struct device *dev,
+ struct dma_ops_domain *dom,
+ unsigned int pages,
+ unsigned long align_mask,
+ u64 dma_mask,
+ unsigned long start)
+{
+ unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
+ int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ int i = start >> APERTURE_RANGE_SHIFT;
+ unsigned long boundary_size;
+ unsigned long address = -1;
+ unsigned long limit;
+
+ next_bit >>= PAGE_SHIFT;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ for (;i < max_index; ++i) {
+ unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+ if (dom->aperture[i]->offset >= dma_mask)
+ break;
+
+ limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+ dma_mask >> PAGE_SHIFT);
+
+ address = iommu_area_alloc(dom->aperture[i]->bitmap,
+ limit, next_bit, pages, 0,
+ boundary_size, align_mask);
+ if (address != -1) {
+ address = dom->aperture[i]->offset +
+ (address << PAGE_SHIFT);
+ dom->next_address = address + (pages << PAGE_SHIFT);
+ break;
+ }
+
+ next_bit = 0;
+ }
+
+ return address;
+}
+
static unsigned long dma_ops_alloc_addresses(struct device *dev,
struct dma_ops_domain *dom,
unsigned int pages,
unsigned long align_mask,
u64 dma_mask)
{
- unsigned long limit;
unsigned long address;
- unsigned long boundary_size;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
- limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
- dma_mask >> PAGE_SHIFT);
+#ifdef CONFIG_IOMMU_STRESS
+ dom->next_address = 0;
+ dom->need_flush = true;
+#endif
- if (dom->next_bit >= limit) {
- dom->next_bit = 0;
- dom->need_flush = true;
- }
+ address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+ dma_mask, dom->next_address);
- address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
- 0 , boundary_size, align_mask);
if (address == -1) {
- address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
- 0, boundary_size, align_mask);
+ dom->next_address = 0;
+ address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+ dma_mask, 0);
dom->need_flush = true;
}
- if (likely(address != -1)) {
- dom->next_bit = address + pages;
- address <<= PAGE_SHIFT;
- } else
+ if (unlikely(address == -1))
address = bad_dma_address;
WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +862,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
unsigned long address,
unsigned int pages)
{
- address >>= PAGE_SHIFT;
- iommu_area_free(dom->bitmap, address, pages);
+ unsigned i = address >> APERTURE_RANGE_SHIFT;
+ struct aperture_range *range = dom->aperture[i];
+
+ BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
- if (address >= dom->next_bit)
+#ifdef CONFIG_IOMMU_STRESS
+ if (i < 4)
+ return;
+#endif
+
+ if (address >= dom->next_address)
dom->need_flush = true;
+
+ address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
+ iommu_area_free(range->bitmap, address, pages);
+
}
/****************************************************************************
@@ -736,12 +926,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
unsigned long start_page,
unsigned int pages)
{
- unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+ unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
if (start_page + pages > last_page)
pages = last_page - start_page;
- iommu_area_reserve(dom->bitmap, start_page, pages);
+ for (i = start_page; i < start_page + pages; ++i) {
+ int index = i / APERTURE_RANGE_PAGES;
+ int page = i % APERTURE_RANGE_PAGES;
+ __set_bit(page, dom->aperture[index]->bitmap);
+ }
}
static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +974,19 @@ static void free_pagetable(struct protection_domain *domain)
*/
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
+ int i;
+
if (!dom)
return;
free_pagetable(&dom->domain);
- kfree(dom->pte_pages);
-
- kfree(dom->bitmap);
+ for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+ if (!dom->aperture[i])
+ continue;
+ free_page((unsigned long)dom->aperture[i]->bitmap);
+ kfree(dom->aperture[i]);
+ }
kfree(dom);
}
@@ -797,19 +996,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
* It also intializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
- unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
{
struct dma_ops_domain *dma_dom;
- unsigned i, num_pte_pages;
- u64 *l2_pde;
- u64 address;
-
- /*
- * Currently the DMA aperture must be between 32 MB and 1GB in size
- */
- if ((order < 25) || (order > 30))
- return NULL;
dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
if (!dma_dom)
@@ -826,55 +1015,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_dom->domain.priv = dma_dom;
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
- dma_dom->aperture_size = (1ULL << order);
- dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
- GFP_KERNEL);
- if (!dma_dom->bitmap)
- goto free_dma_dom;
- /*
- * mark the first page as allocated so we never return 0 as
- * a valid dma-address. So we can use 0 as error value
- */
- dma_dom->bitmap[0] = 1;
- dma_dom->next_bit = 0;
dma_dom->need_flush = false;
dma_dom->target_dev = 0xffff;
- /* Intialize the exclusion range if necessary */
- if (iommu->exclusion_start &&
- iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
+ if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+ goto free_dma_dom;
/*
- * At the last step, build the page tables so we don't need to
- * allocate page table pages in the dma_ops mapping/unmapping
- * path.
+ * mark the first page as allocated so we never return 0 as
+ * a valid dma-address. So we can use 0 as error value
*/
- num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
- dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
- GFP_KERNEL);
- if (!dma_dom->pte_pages)
- goto free_dma_dom;
+ dma_dom->aperture[0]->bitmap[0] = 1;
+ dma_dom->next_address = 0;
- l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (l2_pde == NULL)
- goto free_dma_dom;
-
- dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
- for (i = 0; i < num_pte_pages; ++i) {
- dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!dma_dom->pte_pages[i])
- goto free_dma_dom;
- address = virt_to_phys(dma_dom->pte_pages[i]);
- l2_pde[i] = IOMMU_L1_PDE(address);
- }
return dma_dom;
@@ -934,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu,
amd_iommu_pd_table[devid] = domain;
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ /*
+ * We might boot into a crash-kernel here. The crashed kernel
+ * left the caches in the IOMMU dirty. So we have to flush
+ * here to evict all dirty stuff.
+ */
iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_flush_tlb_pde(iommu, domain->id);
}
/*
@@ -983,7 +1143,6 @@ static int device_change_notifier(struct notifier_block *nb,
struct protection_domain *domain;
struct dma_ops_domain *dma_domain;
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
unsigned long flags;
if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1161,7 @@ static int device_change_notifier(struct notifier_block *nb,
"to a non-dma-ops domain\n", dev_name(dev));
switch (action) {
- case BUS_NOTIFY_BOUND_DRIVER:
- if (domain)
- goto out;
- dma_domain = find_protection_domain(devid);
- if (!dma_domain)
- dma_domain = iommu->default_dom;
- attach_device(iommu, &dma_domain->domain, devid);
- printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device %s\n", dma_domain->domain.id, dev_name(dev));
- break;
- case BUS_NOTIFY_UNBIND_DRIVER:
+ case BUS_NOTIFY_UNBOUND_DRIVER:
if (!domain)
goto out;
detach_device(domain, devid);
@@ -1022,7 +1171,7 @@ static int device_change_notifier(struct notifier_block *nb,
dma_domain = find_protection_domain(devid);
if (dma_domain)
goto out;
- dma_domain = dma_ops_domain_alloc(iommu, order);
+ dma_domain = dma_ops_domain_alloc(iommu);
if (!dma_domain)
goto out;
dma_domain->target_dev = devid;
@@ -1043,7 +1192,7 @@ out:
return 0;
}
-struct notifier_block device_nb = {
+static struct notifier_block device_nb = {
.notifier_call = device_change_notifier,
};
@@ -1133,8 +1282,8 @@ static int get_device_resources(struct device *dev,
dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
attach_device(*iommu, *domain, *bdf);
- printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device %s\n", (*domain)->id, dev_name(dev));
+ DUMP_printk("Using protection domain %d for device %s\n",
+ (*domain)->id, dev_name(dev));
}
if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1293,66 @@ static int get_device_resources(struct device *dev,
}
/*
+ * If the pte_page is not yet allocated this function is called
+ */
+static u64* alloc_pte(struct protection_domain *dom,
+ unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+ u64 *pte, *page;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = IOMMU_L2_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = IOMMU_L1_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+
+ if (pte_page)
+ *pte_page = pte;
+
+ pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+ return pte;
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture
+ */
+static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
+ unsigned long address)
+{
+ struct aperture_range *aperture;
+ u64 *pte, *pte_page;
+
+ aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+ if (!aperture)
+ return NULL;
+
+ pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+ if (!pte) {
+ pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+ aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+ } else
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ return pte;
+}
+
+/*
* This is the generic map function. It maps one 4kb page at paddr to
* the given address in the DMA address space for the domain.
*/
@@ -1159,8 +1368,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
paddr &= PAGE_MASK;
- pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte = dma_ops_get_pte(dom, address);
+ if (!pte)
+ return bad_dma_address;
__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1185,14 +1395,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
struct dma_ops_domain *dom,
unsigned long address)
{
+ struct aperture_range *aperture;
u64 *pte;
if (address >= dom->aperture_size)
return;
- WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+ aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+ if (!aperture)
+ return;
+
+ pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+ if (!pte)
+ return;
- pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
pte += IOMMU_PTE_L0_INDEX(address);
WARN_ON(!*pte);
@@ -1216,7 +1432,7 @@ static dma_addr_t __map_single(struct device *dev,
u64 dma_mask)
{
dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start;
+ dma_addr_t address, start, ret;
unsigned int pages;
unsigned long align_mask = 0;
int i;
@@ -1232,14 +1448,33 @@ static dma_addr_t __map_single(struct device *dev,
if (align)
align_mask = (1UL << get_order(size)) - 1;
+retry:
address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
dma_mask);
- if (unlikely(address == bad_dma_address))
- goto out;
+ if (unlikely(address == bad_dma_address)) {
+ /*
+ * setting next_address here will let the address
+ * allocator only scan the new allocated range in the
+ * first run. This is a small optimization.
+ */
+ dma_dom->next_address = dma_dom->aperture_size;
+
+ if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+ goto out;
+
+ /*
+ * aperture was sucessfully enlarged by 128 MB, try
+ * allocation again
+ */
+ goto retry;
+ }
start = address;
for (i = 0; i < pages; ++i) {
- dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ if (ret == bad_dma_address)
+ goto out_unmap;
+
paddr += PAGE_SIZE;
start += PAGE_SIZE;
}
@@ -1255,6 +1490,17 @@ static dma_addr_t __map_single(struct device *dev,
out:
return address;
+
+out_unmap:
+
+ for (--i; i >= 0; --i) {
+ start -= PAGE_SIZE;
+ dma_ops_domain_unmap(iommu, dma_dom, start);
+ }
+
+ dma_ops_free_addresses(dma_dom, address, pages);
+
+ return bad_dma_address;
}
/*
@@ -1517,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
flag |= __GFP_ZERO;
virt_addr = (void *)__get_free_pages(flag, get_order(size));
if (!virt_addr)
- return 0;
+ return NULL;
paddr = virt_to_phys(virt_addr);
@@ -1537,8 +1783,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address)
+ if (*dma_addr == bad_dma_address) {
+ spin_unlock_irqrestore(&domain->lock, flags);
goto out_free;
+ }
iommu_completion_wait(iommu);
@@ -1625,7 +1873,6 @@ static void prealloc_protection_domains(void)
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1885,7 @@ static void prealloc_protection_domains(void)
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
continue;
- dma_dom = dma_ops_domain_alloc(iommu, order);
+ dma_dom = dma_ops_domain_alloc(iommu);
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1911,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
int __init amd_iommu_init_dma_ops(void)
{
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
int ret;
/*
@@ -1672,8 +1918,8 @@ int __init amd_iommu_init_dma_ops(void)
* found in the system. Devices not assigned to any other
* protection domain will be assigned to the default one.
*/
- list_for_each_entry(iommu, &amd_iommu_list, list) {
- iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+ for_each_iommu(iommu) {
+ iommu->default_dom = dma_ops_domain_alloc(iommu);
if (iommu->default_dom == NULL)
return -ENOMEM;
iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1956,7 @@ int __init amd_iommu_init_dma_ops(void)
free_domains:
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
if (iommu->default_dom)
dma_ops_domain_free(iommu->default_dom);
}
@@ -1842,7 +2088,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
old_domain = domain_for_device(devid);
if (old_domain)
- return -EBUSY;
+ detach_device(old_domain, devid);
attach_device(iommu, domain, devid);
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902da..c1b17e97252 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
u64 range_length;
} __attribute__((packed));
+bool amd_iommu_dump;
+
static int __initdata amd_iommu_detected;
u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
bool amd_iommu_isolate = true; /* if true, device isolation is
enabled */
+#endif
+
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
static inline unsigned long tbl_size(int entry_size)
{
unsigned shift = PAGE_SHIFT +
- get_order(amd_iommu_last_bdf * entry_size);
+ get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
return 1UL << shift;
}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
* This function set the exclusion range in the IOMMU. DMA accesses to the
* exclusion range are passed through untranslated
*/
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
{
u64 start = iommu->exclusion_start & PAGE_MASK;
u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
}
/* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
}
/* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
{
printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,17 @@ static void __init iommu_enable(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void iommu_disable(struct amd_iommu *iommu)
{
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+ /* Disable command buffer */
+ iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+ /* Disable event logging and event interrupts */
+ iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
+ iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
+
+ /* Disable IOMMU hardware itself */
+ iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
}
/*
@@ -413,25 +425,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
{
u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(CMD_BUFFER_SIZE));
- u64 entry;
if (cmd_buf == NULL)
return NULL;
iommu->cmd_buf_size = CMD_BUFFER_SIZE;
- entry = (u64)virt_to_phys(cmd_buf);
+ return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->cmd_buf == NULL);
+
+ entry = (u64)virt_to_phys(iommu->cmd_buf);
entry |= MMIO_CMD_SIZE_512;
+
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
+ &entry, sizeof(entry));
/* set head and tail to zero manually */
writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
- return cmd_buf;
}
static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +466,33 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
/* allocates the memory where the IOMMU will log its events to */
static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
{
- u64 entry;
iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(EVT_BUFFER_SIZE));
if (iommu->evt_buf == NULL)
return NULL;
+ iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
+ return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->evt_buf == NULL);
+
entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
&entry, sizeof(entry));
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
+ /* set head and tail to zero manually */
+ writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
- return iommu->evt_buf;
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
}
static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +632,84 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
p += sizeof(struct ivhd_header);
end += h->length;
+
while (p < end) {
e = (struct ivhd_entry *)p;
switch (e->type) {
case IVHD_DEV_ALL:
+
+ DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+ " last device %02x:%02x.%x flags: %02x\n",
+ PCI_BUS(iommu->first_device),
+ PCI_SLOT(iommu->first_device),
+ PCI_FUNC(iommu->first_device),
+ PCI_BUS(iommu->last_device),
+ PCI_SLOT(iommu->last_device),
+ PCI_FUNC(iommu->last_device),
+ e->flags);
+
for (dev_i = iommu->first_device;
dev_i <= iommu->last_device; ++dev_i)
set_dev_entry_from_acpi(iommu, dev_i,
e->flags, 0);
break;
case IVHD_DEV_SELECT:
+
+ DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
devid = e->devid;
set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
break;
case IVHD_DEV_SELECT_RANGE_START:
+
+ DUMP_printk(" DEV_SELECT_RANGE_START\t "
+ "devid: %02x:%02x.%x flags: %02x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
devid_start = e->devid;
flags = e->flags;
ext_flags = 0;
alias = false;
break;
case IVHD_DEV_ALIAS:
+
+ DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x devid_to: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
devid = e->devid;
devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+ set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
+ set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
amd_iommu_alias_table[devid] = devid_to;
break;
case IVHD_DEV_ALIAS_RANGE:
+
+ DUMP_printk(" DEV_ALIAS_RANGE\t\t "
+ "devid: %02x:%02x.%x flags: %02x "
+ "devid_to: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
devid_start = e->devid;
flags = e->flags;
devid_to = e->ext >> 8;
@@ -629,24 +717,48 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
alias = true;
break;
case IVHD_DEV_EXT_SELECT:
+
+ DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+ "flags: %02x ext: %08x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
devid = e->devid;
set_dev_entry_from_acpi(iommu, devid, e->flags,
e->ext);
break;
case IVHD_DEV_EXT_SELECT_RANGE:
+
+ DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
+ "%02x:%02x.%x flags: %02x ext: %08x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
devid_start = e->devid;
flags = e->flags;
ext_flags = e->ext;
alias = false;
break;
case IVHD_DEV_RANGE_END:
+
+ DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid));
+
devid = e->devid;
for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias)
+ if (alias) {
amd_iommu_alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- amd_iommu_alias_table[dev_i],
- flags, ext_flags);
+ set_dev_entry_from_acpi(iommu,
+ devid_to, flags, ext_flags);
+ }
+ set_dev_entry_from_acpi(iommu, dev_i,
+ flags, ext_flags);
}
break;
default:
@@ -679,7 +791,7 @@ static void __init free_iommu_all(void)
{
struct amd_iommu *iommu, *next;
- list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+ for_each_iommu_safe(iommu, next) {
list_del(&iommu->list);
free_iommu_one(iommu);
kfree(iommu);
@@ -710,7 +822,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (!iommu->mmio_base)
return -ENOMEM;
- iommu_set_device_table(iommu);
iommu->cmd_buf = alloc_command_buffer(iommu);
if (!iommu->cmd_buf)
return -ENOMEM;
@@ -746,6 +857,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
h = (struct ivhd_header *)p;
switch (*p) {
case ACPI_IVHD_TYPE:
+
+ DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+ "seg: %d flags: %01x info %04x\n",
+ PCI_BUS(h->devid), PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid), h->cap_ptr,
+ h->pci_seg, h->flags, h->info);
+ DUMP_printk(" mmio-addr: %016llx\n",
+ h->mmio_phys);
+
iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
if (iommu == NULL)
return -ENOMEM;
@@ -773,56 +893,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
*
****************************************************************************/
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
- struct amd_iommu *curr;
- struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
- int nvec = 0, i;
-
- list_for_each_entry(curr, &amd_iommu_list, list) {
- if (curr->dev == iommu->dev) {
- entries[nvec].entry = curr->evt_msi_num;
- entries[nvec].vector = 0;
- curr->int_enabled = true;
- nvec++;
- }
- }
-
- if (pci_enable_msix(iommu->dev, entries, nvec)) {
- pci_disable_msix(iommu->dev);
- return 1;
- }
-
- for (i = 0; i < nvec; ++i) {
- int r = request_irq(entries->vector, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD IOMMU",
- NULL);
- if (r)
- goto out_free;
- }
-
- return 0;
-
-out_free:
- for (i -= 1; i >= 0; --i)
- free_irq(entries->vector, NULL);
-
- pci_disable_msix(iommu->dev);
-
- return 1;
-}
-
static int __init iommu_setup_msi(struct amd_iommu *iommu)
{
int r;
- struct amd_iommu *curr;
-
- list_for_each_entry(curr, &amd_iommu_list, list) {
- if (curr->dev == iommu->dev)
- curr->int_enabled = true;
- }
-
if (pci_enable_msi(iommu->dev))
return 1;
@@ -837,17 +910,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
return 1;
}
+ iommu->int_enabled = true;
+ iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
return 0;
}
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
{
if (iommu->int_enabled)
return 0;
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
- return iommu_setup_msix(iommu);
- else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+ if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
return iommu_setup_msi(iommu);
return 1;
@@ -899,6 +973,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
static int __init init_unity_map_range(struct ivmd_header *m)
{
struct unity_map_entry *e = 0;
+ char *s;
e = kzalloc(sizeof(*e), GFP_KERNEL);
if (e == NULL)
@@ -906,14 +981,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
switch (m->type) {
default:
+ kfree(e);
+ return 0;
case ACPI_IVMD_TYPE:
+ s = "IVMD_TYPEi\t\t\t";
e->devid_start = e->devid_end = m->devid;
break;
case ACPI_IVMD_TYPE_ALL:
+ s = "IVMD_TYPE_ALL\t\t";
e->devid_start = 0;
e->devid_end = amd_iommu_last_bdf;
break;
case ACPI_IVMD_TYPE_RANGE:
+ s = "IVMD_TYPE_RANGE\t\t";
e->devid_start = m->devid;
e->devid_end = m->aux;
break;
@@ -922,6 +1002,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
e->prot = m->flags >> 1;
+ DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+ " range_start: %016llx range_end: %016llx flags: %x\n", s,
+ PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+ PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+ PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+ e->address_start, e->address_end, m->flags);
+
list_add_tail(&e->list, &amd_iommu_unity_map);
return 0;
@@ -967,18 +1054,29 @@ static void init_device_table(void)
* This function finally enables all IOMMUs found in the system after
* they have been initialized
*/
-static void __init enable_iommus(void)
+static void enable_iommus(void)
{
struct amd_iommu *iommu;
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
+ iommu_disable(iommu);
+ iommu_set_device_table(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
iommu_set_exclusion_range(iommu);
iommu_init_msi(iommu);
- iommu_enable_event_logging(iommu);
iommu_enable(iommu);
}
}
+static void disable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_disable(iommu);
+}
+
/*
* Suspend/Resume support
* disable suspend until real resume implemented
@@ -986,12 +1084,25 @@ static void __init enable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ /* re-load the hardware */
+ enable_iommus();
+
+ /*
+ * we have to flush after the IOMMUs are enabled because a
+ * disabled IOMMU will never execute the commands we send
+ */
+ amd_iommu_flush_all_devices();
+ amd_iommu_flush_all_domains();
+
return 0;
}
static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
{
- return -EINVAL;
+ /* disable IOMMUs to go out of the way for BIOS */
+ disable_iommus();
+
+ return 0;
}
static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1248,6 @@ int __init amd_iommu_init(void)
enable_iommus();
- printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
- (1 << (amd_iommu_aperture_order-20)));
-
printk(KERN_INFO "AMD IOMMU: device isolation ");
if (amd_iommu_isolate)
printk("enabled\n");
@@ -1177,6 +1285,11 @@ free:
goto out;
}
+void amd_iommu_shutdown(void)
+{
+ disable_iommus();
+}
+
/****************************************************************************
*
* Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1211,6 +1324,13 @@ void __init amd_iommu_detect(void)
*
****************************************************************************/
+static int __init parse_amd_iommu_dump(char *str)
+{
+ amd_iommu_dump = true;
+
+ return 1;
+}
+
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
@@ -1225,15 +1345,5 @@ static int __init parse_amd_iommu_options(char *str)
return 1;
}
-static int __init parse_amd_iommu_size_options(char *str)
-{
- unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
- if ((order > 24) && (order < 31))
- amd_iommu_aperture_order = order;
-
- return 1;
-}
-
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
__setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f24..0a1c2830ec6 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
* Mikael Pettersson : PM converted to driver model.
*/
+#include <linux/perf_counter.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/perf_counter.h>
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
/* Local APIC was disabled by the BIOS and enabled by the kernel */
static int enabled_via_apicbase;
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go through APIC */
+ outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go directly to BSP */
+ outb(0x00, 0x23);
+}
#endif
#ifdef CONFIG_X86_64
@@ -111,14 +136,18 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
+int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
-int x2apic;
/* x2apic enabled before OS handover */
static int x2apic_preenabled;
-static int disable_x2apic;
static __init int setup_nox2apic(char *str)
{
- disable_x2apic = 1;
+ if (x2apic_enabled()) {
+ pr_warning("Bios already enabled x2apic, "
+ "can't enforce nox2apic");
+ return 0;
+ }
+
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
return 0;
}
@@ -209,6 +238,31 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
+/*
+ * bare function to substitute write operation
+ * and it's _that_ fast :)
+ */
+static void native_apic_write_dummy(u32 reg, u32 v)
+{
+ WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+static u32 native_apic_read_dummy(u32 reg)
+{
+ WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ return 0;
+}
+
+/*
+ * right after this call apic->write/read doesn't do anything
+ * note that there is no restore operation it works one way
+ */
+void apic_disable(void)
+{
+ apic->read = native_apic_read_dummy;
+ apic->write = native_apic_write_dummy;
+}
+
void native_apic_wait_icr_idle(void)
{
while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +402,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+ unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
unsigned int v = (mask << 16) | (msg_type << 8) | vector;
apic_write(reg, v);
@@ -815,7 +869,7 @@ void clear_local_APIC(void)
u32 v;
/* APIC hasn't been mapped yet */
- if (!x2apic && !apic_phys)
+ if (!x2apic_mode && !apic_phys)
return;
maxlvt = lapic_get_maxlvt();
@@ -843,7 +897,7 @@ void clear_local_APIC(void)
}
/* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -1133,6 +1187,7 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
+ perf_counters_lapic_init();
preempt_disable();
@@ -1287,7 +1342,7 @@ void check_x2apic(void)
{
if (x2apic_enabled()) {
pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic = 1;
+ x2apic_preenabled = x2apic_mode = 1;
}
}
@@ -1295,7 +1350,7 @@ void enable_x2apic(void)
{
int msr, msr2;
- if (!x2apic)
+ if (!x2apic_mode)
return;
rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1359,7 @@ void enable_x2apic(void)
wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
}
}
+#endif /* CONFIG_X86_X2APIC */
void __init enable_IR_x2apic(void)
{
@@ -1312,32 +1368,21 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
struct IO_APIC_route_entry **ioapic_entries = NULL;
- if (!cpu_has_x2apic)
- return;
-
- if (!x2apic_preenabled && disable_x2apic) {
- pr_info("Skipped enabling x2apic and Interrupt-remapping "
- "because of nox2apic\n");
- return;
+ ret = dmar_table_init();
+ if (ret) {
+ pr_debug("dmar_table_init() failed with %d:\n", ret);
+ goto ir_failed;
}
- if (x2apic_preenabled && disable_x2apic)
- panic("Bios already enabled x2apic, can't enforce nox2apic");
-
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling x2apic and Interrupt-remapping "
- "because of skipping io-apic setup\n");
- return;
+ if (!intr_remapping_supported()) {
+ pr_debug("intr-remapping not supported\n");
+ goto ir_failed;
}
- ret = dmar_table_init();
- if (ret) {
- pr_info("dmar_table_init() failed with %d:\n", ret);
- if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
- else
- pr_info("Not enabling x2apic,Intr-remapping\n");
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ pr_info("Skipped enabling intr-remap because of skipping "
+ "io-apic setup\n");
return;
}
@@ -1357,19 +1402,16 @@ void __init enable_IR_x2apic(void)
mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
- ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
- if (ret && x2apic_preenabled) {
- local_irq_restore(flags);
- panic("x2apic enabled by bios. But IR enabling failed");
- }
-
+ ret = enable_intr_remapping(x2apic_supported());
if (ret)
goto end_restore;
- if (!x2apic) {
- x2apic = 1;
+ pr_info("Enabled Interrupt-remapping\n");
+
+ if (x2apic_supported() && !x2apic_mode) {
+ x2apic_mode = 1;
enable_x2apic();
+ pr_info("Enabled x2apic\n");
}
end_restore:
@@ -1378,37 +1420,34 @@ end_restore:
* IR enabling failed
*/
restore_IO_APIC_setup(ioapic_entries);
- else
- reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
unmask_8259A();
local_irq_restore(flags);
end:
- if (!ret) {
- if (!x2apic_preenabled)
- pr_info("Enabled x2apic and interrupt-remapping\n");
- else
- pr_info("Enabled Interrupt-remapping\n");
- } else
- pr_err("Failed to enable Interrupt-remapping and x2apic\n");
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);
+
+ if (!ret)
+ return;
+
+ir_failed:
+ if (x2apic_preenabled)
+ panic("x2apic enabled by bios. But IR enabling failed");
+ else if (cpu_has_x2apic)
+ pr_info("Not enabling x2apic,Intr-remapping\n");
#else
if (!cpu_has_x2apic)
return;
if (x2apic_preenabled)
panic("x2apic enabled prior OS handover,"
- " enable CONFIG_INTR_REMAP");
-
- pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- " and x2apic\n");
+ " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
#endif
return;
}
-#endif /* CONFIG_X86_X2APIC */
+
#ifdef CONFIG_X86_64
/*
@@ -1425,7 +1464,6 @@ static int __init detect_init_APIC(void)
}
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
return 0;
}
#else
@@ -1539,32 +1577,49 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
- if (x2apic) {
+ unsigned int new_apicid;
+
+ if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
return;
}
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
+ /* If no local APIC can be found return early */
if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
+ /* lets NOP'ify apic operations */
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ } else {
apic_phys = mp_lapic_addr;
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
- APIC_BASE, apic_phys);
+ /*
+ * acpi lapic path already maps that address in
+ * acpi_register_lapic_address()
+ */
+ if (!acpi_lapic)
+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+ APIC_BASE, apic_phys);
+ }
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = read_apic_id();
+ new_apicid = read_apic_id();
+ if (boot_cpu_physical_apicid != new_apicid) {
+ boot_cpu_physical_apicid = new_apicid;
+ /*
+ * yeah -- we lie about apic_version
+ * in case if apic was disabled via boot option
+ * but it's not a problem for SMP compiled kernel
+ * since smp_sanity_check is prepared for such a case
+ * and disable smp mode
+ */
+ apic_version[new_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
}
/*
@@ -1733,8 +1788,7 @@ void __init connect_bsp_APIC(void)
*/
apic_printk(APIC_VERBOSE, "leaving PIC mode, "
"enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
+ imcr_pic_to_apic();
}
#endif
if (apic->enable_apic_mode)
@@ -1762,8 +1816,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
*/
apic_printk(APIC_VERBOSE, "disabling APIC mode, "
"entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
+ imcr_apic_to_pic();
return;
}
#endif
@@ -1962,17 +2015,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
local_irq_save(flags);
disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
if (intr_remapping_enabled)
disable_intr_remapping();
-#endif
+
local_irq_restore(flags);
return 0;
}
@@ -1982,42 +2035,34 @@ static int lapic_resume(struct sys_device *dev)
unsigned int l, h;
unsigned long flags;
int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
- int ret;
+ int ret = 0;
struct IO_APIC_route_entry **ioapic_entries = NULL;
if (!apic_pm_state.active)
return 0;
local_irq_save(flags);
- if (x2apic) {
+ if (intr_remapping_enabled) {
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto restore;
}
ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {
WARN(1, "Saving IO-APIC state failed: %d\n", ret);
free_ioapic_entries(ioapic_entries);
- return ret;
+ goto restore;
}
mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
- enable_x2apic();
}
-#else
- if (!apic_pm_state.active)
- return 0;
- local_irq_save(flags);
- if (x2apic)
+ if (x2apic_mode)
enable_x2apic();
-#endif
-
else {
/*
* Make sure the APICBASE points to the right address
@@ -2055,21 +2100,16 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
-#ifdef CONFIG_INTR_REMAP
- if (intr_remapping_enabled)
- reenable_intr_remapping(EIM_32BIT_APIC_ID);
-
- if (x2apic) {
+ if (intr_remapping_enabled) {
+ reenable_intr_remapping(x2apic_mode);
unmask_8259A();
restore_IO_APIC_setup(ioapic_entries);
free_ioapic_entries(ioapic_entries);
}
-#endif
-
+restore:
local_irq_restore(flags);
-
- return 0;
+ return ret;
}
/*
@@ -2117,31 +2157,14 @@ static void apic_pm_activate(void) { }
#endif /* CONFIG_PM */
#ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
{
int i, clusters, zeros;
unsigned id;
u16 *bios_cpu_apicid;
DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
- /*
- * there is not this kind of box with AMD CPU yet.
- * Some AMD box with quadcore cpu and 8 sockets apicid
- * will be [4, 0x23] or [8, 0x27] could be thought to
- * vsmp box still need checking...
- */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
- return 0;
-
bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
@@ -2177,18 +2200,67 @@ __cpuinit int apic_is_clustered_box(void)
++zeros;
}
- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (is_vsmp_box() && clusters > 1)
+ return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+ if (multi)
+ return 0;
+ pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+ multi = 1;
+ return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+ {
+ .callback = set_multi,
+ .ident = "IBM System Summit2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+ },
+ },
+ {}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+ if (multi_checked)
+ return;
+
+ dmi_check_system(multi_dmi_table);
+ multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+ dmi_check_multi();
+ if (multi)
return 1;
+ if (!is_vsmp_box())
+ return 0;
+
/*
- * If clusters > 2, then should be multi-chassis.
- * May have to revisit this when multi-core + hyperthreaded CPUs come
- * out, but AFAIK this will work even for them.
+ * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
*/
- return (clusters > 2);
+ if (apic_cluster_num() > 1)
+ return 1;
+
+ return 0;
}
#endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6..d0c99abc26c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
- return hard_smp_processor_id() >> index_msb;
+ return initial_apic_id >> index_msb;
}
struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
* regardless of how many processors are present (x86_64 ES7000
* is an example).
*/
- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 30294777557..8952a589028 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
return gsi;
}
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
{
unsigned long vect = 0, psaival = 0;
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
return ret && es7000_apic_is_cluster();
}
-struct apic apic_es7000_cluster = {
+/* We've been warned by a false positive warning.Use __refdata to keep calm. */
+struct apic __refdata apic_es7000_cluster = {
.name = "es7000",
.probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e..d2ed6c5ddc8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
+#include <asm/hw_irq.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
@@ -129,12 +130,9 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
{
struct irq_pin_list *pin;
- int node;
-
- node = cpu_to_node(cpu);
pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
@@ -148,9 +146,6 @@ struct irq_cfg {
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- u8 move_desc_pending : 1;
-#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -182,16 +177,18 @@ int __init arch_early_irq_init(void)
struct irq_cfg *cfg;
struct irq_desc *desc;
int count;
+ int node;
int i;
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
+ node= cpu_to_node(boot_cpu_id);
for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
- alloc_bootmem_cpumask_var(&cfg[i].domain);
- alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
@@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
return cfg;
}
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
{
struct irq_cfg *cfg;
- int node;
-
- node = cpu_to_node(cpu);
cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
if (cfg) {
@@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
return cfg;
}
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
{
struct irq_cfg *cfg;
cfg = desc->chip_data;
if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(cpu);
+ desc->chip_data = get_one_free_irq_cfg(node);
if (!desc->chip_data) {
printk(KERN_ERR "can not alloc irq_cfg\n");
BUG_ON(1);
@@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
return 0;
}
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
{
struct irq_pin_list *old_entry, *head, *tail, *entry;
@@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
if (!old_entry)
return;
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry)
return;
@@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
tail = entry;
old_entry = old_entry->next;
while (old_entry) {
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry) {
entry = head;
while (entry) {
@@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
}
void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int cpu)
+ struct irq_desc *desc, int node)
{
struct irq_cfg *cfg;
struct irq_cfg *old_cfg;
- cfg = get_one_free_irq_cfg(cpu);
+ cfg = get_one_free_irq_cfg(node);
if (!cfg)
return;
@@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
- init_copy_irq_2_pin(old_cfg, cfg, cpu);
+ init_copy_irq_2_pin(old_cfg, cfg, node);
}
static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
old_desc->chip_data = NULL;
}
}
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg = desc->chip_data;
-
- if (!cfg->move_in_progress) {
- /* it means that domain is not changed */
- if (!cpumask_intersects(desc->affinity, mask))
- cfg->move_desc_pending = 1;
- }
-}
-#endif
+/* end for move_irq_desc */
#else
static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
#endif
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -488,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
static void
__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- union entry_union eu;
+ union entry_union eu = {{0, 0}};
+
eu.entry = e;
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -518,132 +493,18 @@ static void ioapic_mask_entry(int apic, int pin)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
- cfg->move_cleanup_count = 0;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- cfg->move_cleanup_count++;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
- cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- entry = cfg->irq_2_pin;
- for (;;) {
- unsigned int reg;
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(irq))
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = entry->next;
- }
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned int irq;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return BAD_APICID;
-
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return BAD_APICID;
-
- /* check that before desc->addinity get updated */
- set_extra_move_desc(desc, mask);
-
- cpumask_copy(desc->affinity, mask);
-
- return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- dest = set_desc_affinity(desc, mask);
- if (dest != BAD_APICID) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list *entry;
entry = cfg->irq_2_pin;
if (!entry) {
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry) {
printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
apic, pin);
@@ -663,7 +524,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
entry = entry->next;
}
- entry->next = get_one_free_irq_2_pin(cpu);
+ entry->next = get_one_free_irq_2_pin(node);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -672,7 +533,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
int oldapic, int oldpin,
int newapic, int newpin)
{
@@ -692,7 +553,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
/* why? call replace before add? */
if (!replaced)
- add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+ add_pin_to_irq_node(cfg, node, newapic, newpin);
}
static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +711,6 @@ static int __init ioapic_pirq_setup(char *str)
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_INTR_REMAP
struct IO_APIC_route_entry **alloc_ioapic_entries(void)
{
int apic;
@@ -948,20 +808,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
return 0;
}
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
- struct IO_APIC_route_entry **ioapic_entries)
-
-{
- /*
- * for now plain restore of previous settings.
- * TBD: In the case of OS enabling interrupt-remapping,
- * IO-APIC RTE's need to be setup to point to interrupt-remapping
- * table entries. for now, do a plain restore, and wait for
- * the setup_IO_APIC_irqs() to do proper initialization.
- */
- restore_IO_APIC_setup(ioapic_entries);
-}
-
void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
{
int apic;
@@ -971,7 +817,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
kfree(ioapic_entries);
}
-#endif
/*
* Find the IRQ entry number of a certain pin.
@@ -1032,54 +877,6 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
- int apic, i, best_guess = -1;
-
- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
- return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].srcbus;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
- break;
-
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
-
- if (pin == (mp_irqs[i].srcbusirq & 3))
- return irq;
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0)
- best_guess = irq;
- }
- }
- return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
/*
* EISA Edge/Level control register, ELCR
@@ -1298,6 +1095,64 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ set_io_apic_irq_attr(irq_attr, apic,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ return irq;
+ }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0) {
+ set_io_apic_irq_attr(irq_attr, apic,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ best_guess = irq;
+ }
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
void lock_vector_lock(void)
{
/* Used to the online set of cpus does not change
@@ -1559,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq,
irte.vector = vector;
irte.dest_id = IRTE_DEST(destination);
+ /* Set source-id of interrupt request */
+ set_ioapic_sid(&irte, apic_id);
+
modify_irte(irq, &irte);
ir_entry->index2 = (index >> 15) & 0x1;
@@ -1628,58 +1486,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
ioapic_write_entry(apic_id, pin, entry);
}
+static struct {
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id, pin, idx, irq;
+ int apic_id = 0, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
- int cpu = boot_cpu_id;
+ int node = cpu_to_node(boot_cpu_id);
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
- idx = find_irq_entry(apic_id, pin, mp_INT);
- if (idx == -1) {
- if (!notcon) {
- notcon = 1;
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- } else
- apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- continue;
- }
- if (notcon) {
+#ifdef CONFIG_ACPI
+ if (!acpi_disabled && acpi_ioapic) {
+ apic_id = mp_find_ioapic(0);
+ if (apic_id < 0)
+ apic_id = 0;
+ }
+#endif
+
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+ idx = find_irq_entry(apic_id, pin, mp_INT);
+ if (idx == -1) {
+ if (!notcon) {
+ notcon = 1;
apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
- notcon = 0;
- }
+ KERN_DEBUG " %d-%d",
+ mp_ioapics[apic_id].apicid, pin);
+ } else
+ apic_printk(APIC_VERBOSE, " %d-%d",
+ mp_ioapics[apic_id].apicid, pin);
+ continue;
+ }
+ if (notcon) {
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
+ notcon = 0;
+ }
- irq = pin_2_irq(idx, apic_id, pin);
+ irq = pin_2_irq(idx, apic_id, pin);
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(apic_id, irq))
- continue;
-
- desc = irq_to_desc_alloc_cpu(irq, cpu);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
- continue;
- }
- cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
+ continue;
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
}
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, apic_id, pin);
+ /*
+ * don't mark it in pin_programmed, so later acpi could
+ * set it correctly when irq < 16
+ */
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
}
if (notcon)
@@ -1846,36 +1716,30 @@ __apicdebuginit(void) print_IO_APIC(void)
return;
}
-__apicdebuginit(void) print_APIC_bitfield(int base)
+__apicdebuginit(void) print_APIC_field(int base)
{
- unsigned int v;
- int i, j;
+ int i;
if (apic_verbosity == APIC_QUIET)
return;
- printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
- for (i = 0; i < 8; i++) {
- v = apic_read(base + i*0x10);
- for (j = 0; j < 32; j++) {
- if (v & (1<<j))
- printk("1");
- else
- printk("0");
- }
- printk("\n");
- }
+ printk(KERN_DEBUG);
+
+ for (i = 0; i < 8; i++)
+ printk(KERN_CONT "%08x", apic_read(base + i*0x10));
+
+ printk(KERN_CONT "\n");
}
__apicdebuginit(void) print_local_APIC(void *dummy)
{
- unsigned int v, ver, maxlvt;
+ unsigned int i, v, ver, maxlvt;
u64 icr;
if (apic_verbosity == APIC_QUIET)
return;
- printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
@@ -1916,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_bitfield(APIC_ISR);
+ print_APIC_field(APIC_ISR);
printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_bitfield(APIC_TMR);
+ print_APIC_field(APIC_TMR);
printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_bitfield(APIC_IRR);
+ print_APIC_field(APIC_IRR);
if (APIC_INTEGRATED(ver)) { /* !82489DX */
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
v = apic_read(APIC_TDCR);
printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
printk("\n");
}
@@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void)
__apicdebuginit(int) print_all_ICs(void)
{
print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!cpu_has_apic || disable_apic)
+ return 0;
+
print_all_local_APICs();
print_IO_APIC();
@@ -2120,7 +2001,9 @@ void disable_IO_APIC(void)
/*
* Use virtual wire A mode when interrupt remapping is enabled.
*/
- disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
+ if (cpu_has_apic)
+ disconnect_bsp_APIC(!intr_remapping_enabled &&
+ ioapic_i8259.pin != -1);
}
#ifdef CONFIG_X86_32
@@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
*/
#ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+ unsigned int reg;
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ /*
+ * With interrupt-remapping, destination information comes
+ * from interrupt-remapping table entry.
+ */
+ if (!irq_remapped(irq))
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+ io_apic_modify(apic, 0x10 + pin*2, reg);
+ if (!entry->next)
+ break;
+ entry = entry->next;
+ }
+}
+
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_copy(desc->affinity, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static int
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int dest;
+ unsigned int irq;
+ int ret = -1;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return ret;
+}
+
+static int
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+
+ return set_ioapic_affinity_irq_desc(desc, mask);
+}
#ifdef CONFIG_INTR_REMAP
@@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
* Real vector that is used for interrupting cpu will be coming from
* the interrupt-remapping table entry.
*/
-static void
+static int
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
unsigned int dest;
unsigned int irq;
+ int ret = -1;
if (!cpumask_intersects(mask, cpu_online_mask))
- return;
+ return ret;
irq = desc->irq;
if (get_irte(irq, &irte))
- return;
+ return ret;
cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
+ return ret;
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
@@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
send_cleanup_vector(cfg);
cpumask_copy(desc->affinity, mask);
+
+ return 0;
}
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- migrate_ioapic_irq_desc(desc, mask);
+ return migrate_ioapic_irq_desc(desc, mask);
}
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
- set_ir_ioapic_affinity_irq_desc(desc, mask);
+ return set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
+ return 0;
}
#endif
@@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp)
struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- if (likely(!cfg->move_desc_pending))
- return;
-
- /* domain has not changed, but affinity did */
- me = smp_processor_id();
- if (cpumask_test_cpu(me, desc->affinity)) {
- *descp = desc = move_irq_desc(desc, me);
- /* get the new one */
- cfg = desc->chip_data;
- cfg->move_desc_pending = 0;
- }
-#endif
+ if (likely(!cfg->move_in_progress))
return;
- }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- *descp = desc = move_irq_desc(desc, me);
- /* get the new one */
- cfg = desc->chip_data;
-#endif
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
send_cleanup_vector(cfg);
- }
}
#else
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
-
- entry = cfg->irq_2_pin;
- for (;;) {
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- io_apic_eoi(apic, pin);
- entry = entry->next;
- }
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- ack_x2APIC_irq();
- eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
- ack_x2APIC_irq();
-}
-#endif
-
static void ack_apic_edge(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq)
*/
ack_APIC_irq();
- if (irq_remapped(irq))
- eoi_ioapic_irq(desc);
-
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq)
}
#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ir_ack_apic_edge(unsigned int irq)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_enabled())
- return ack_x2apic_edge(irq);
-#endif
- return ack_apic_edge(irq);
+ ack_APIC_irq();
}
static void ir_ack_apic_level(unsigned int irq)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_enabled())
- return ack_x2apic_level(irq);
-#endif
- return ack_apic_level(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ ack_APIC_irq();
+ eoi_ioapic_irq(desc);
}
#endif /* CONFIG_INTR_REMAP */
@@ -2903,7 +2858,7 @@ static inline void __init check_timer(void)
{
struct irq_desc *desc = irq_to_desc(0);
struct irq_cfg *cfg = desc->chip_data;
- int cpu = boot_cpu_id;
+ int node = cpu_to_node(boot_cpu_id);
int apic1, pin1, apic2, pin2;
unsigned long flags;
int no_pin1 = 0;
@@ -2969,7 +2924,7 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+ add_pin_to_irq_node(cfg, node, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
} else {
/* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2961,7 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
enable_8259A_irq(0);
if (timer_irq_works()) {
@@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
{
/* Allocate an unused irq */
unsigned int irq;
unsigned int new;
unsigned long flags;
struct irq_cfg *cfg_new = NULL;
- int cpu = boot_cpu_id;
struct irq_desc *desc_new = NULL;
irq = 0;
@@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
continue;
@@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (cfg_new->vector != 0)
continue;
+
+ desc_new = move_irq_desc(desc_new, node);
+
if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
@@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
int create_irq(void)
{
+ int node = cpu_to_node(boot_cpu_id);
unsigned int irq_want;
int irq;
irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, node);
if (irq == 0)
irq = -1;
@@ -3329,6 +3287,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
+ /* Set source-id of interrupt request */
+ set_msi_sid(&irte, pdev);
+
modify_irte(irq, &irte);
msg->address_hi = MSI_ADDR_BASE_HI;
@@ -3366,7 +3327,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
}
#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3375,7 +3336,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3387,13 +3348,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg_desc(desc, &msg);
+
+ return 0;
}
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
-static void
+static int
ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3365,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
struct irte irte;
if (get_irte(irq, &irte))
- return;
+ return -1;
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3386,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
*/
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
+
+ return 0;
}
#endif
@@ -3518,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
unsigned int irq_want;
struct intel_iommu *iommu = NULL;
int index = 0;
+ int node;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
+ node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, node);
if (irq == 0)
return -1;
irq_want = irq + 1;
@@ -3576,7 +3543,7 @@ void arch_teardown_msi_irq(unsigned int irq)
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3585,7 +3552,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3597,11 +3564,13 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
+
+ return 0;
}
#endif /* CONFIG_SMP */
-struct irq_chip dmar_msi_type = {
+static struct irq_chip dmar_msi_type = {
.name = "DMAR_MSI",
.unmask = dmar_msi_unmask,
.mask = dmar_msi_mask,
@@ -3630,7 +3599,7 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3639,7 +3608,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3651,6 +3620,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
+
+ return 0;
}
#endif /* CONFIG_SMP */
@@ -3707,7 +3678,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3715,11 +3686,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
target_ht_irq(irq, dest, cfg->vector);
+
+ return 0;
}
#endif
@@ -3794,6 +3767,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long flags;
int err;
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
cfg = irq_cfg(irq);
err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,19 +3782,20 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
+
return irq;
}
@@ -3833,10 +3809,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
struct uv_IO_APIC_route_entry *entry;
int mmr_pnode;
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
entry->mask = 1;
mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3876,71 @@ int __init arch_probe_nr_irqs(void)
}
#endif
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int node;
+ int ioapic, pin;
+ int trigger, polarity;
+
+ ioapic = irq_attr->ioapic;
+ if (!IO_APIC_IRQ(irq)) {
+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ if (dev)
+ node = dev_to_node(dev);
+ else
+ node = cpu_to_node(boot_cpu_id);
+
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
+ pin = irq_attr->ioapic_pin;
+ trigger = irq_attr->trigger;
+ polarity = irq_attr->polarity;
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, ioapic, pin);
+ }
+
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+
+ return 0;
+}
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int ioapic, pin;
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ ioapic = irq_attr->ioapic;
+ pin = irq_attr->ioapic_pin;
+ if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mp_ioapics[ioapic].apicid, pin);
+ return 0;
+ }
+ set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+ return __io_apic_set_pci_routing(dev, irq, irq_attr);
+}
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
@@ -3980,6 +4021,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
return apic_id;
}
+#endif
int __init io_apic_get_version(int ioapic)
{
@@ -3992,39 +4034,6 @@ int __init io_apic_get_version(int ioapic)
return reg_01.bits.version;
}
-#endif
-
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
-{
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int cpu = boot_cpu_id;
-
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- desc = irq_to_desc_alloc_cpu(irq, cpu);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
- return 0;
- }
-
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= NR_IRQS_LEGACY) {
- cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
- }
-
- setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
- return 0;
-}
-
int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
{
@@ -4055,51 +4064,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
- int pin, ioapic, irq, irq_entry;
+ int pin, ioapic = 0, irq, irq_entry;
struct irq_desc *desc;
- struct irq_cfg *cfg;
const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
-
- /* setup_IO_APIC_irqs could fail to get vector for some device
- * when you have too many devices, because at that time only boot
- * cpu is online.
- */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- if (!cfg->vector) {
- setup_IO_APIC_irq(ioapic, pin, irq, desc,
- irq_trigger(irq_entry),
- irq_polarity(irq_entry));
- continue;
+#ifdef CONFIG_ACPI
+ if (!acpi_disabled && acpi_ioapic) {
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ ioapic = 0;
+ }
+#endif
- }
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
- /*
- * Honour affinities which have been set in early boot
- */
- if (desc->status &
- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
- else
- mask = apic->target_cpus();
+ desc = irq_to_desc(irq);
- if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
- else
- set_ioapic_affinity_irq_desc(desc, mask);
- }
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+ mask = desc->affinity;
+ else
+ mask = apic->target_cpus();
+ if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
+ else
+ set_ioapic_affinity_irq_desc(desc, mask);
}
+
}
#endif
@@ -4182,28 +4184,20 @@ fake_ioapic_page:
}
}
-static int __init ioapic_insert_resources(void)
+void __init ioapic_insert_resources(void)
{
int i;
struct resource *r = ioapic_resources;
if (!r) {
- if (nr_ioapics > 0) {
+ if (nr_ioapics > 0)
printk(KERN_ERR
"IO APIC resources couldn't be allocated.\n");
- return -1;
- }
- return 0;
+ return;
}
for (i = 0; i < nr_ioapics; i++) {
insert_resource(&iomem_resource, r);
r++;
}
-
- return 0;
}
-
-/* Insert the IO APIC resources after PCI initialization has occured to handle
- * IO APICS that are mapped in on a BAR in PCI space. */
-late_initcall(ioapic_insert_resources);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index dbf5445727a..6ef00ba4c88 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
+ if (WARN_ONCE(!mask, "empty IPI mask"))
+ return;
+
local_irq_save(flags);
WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
__default_send_IPI_dest_field(mask, vector, apic->dest_logical);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a..b3025b43b63 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
static inline int mce_in_progress(void)
{
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#if defined(CONFIG_X86_NEW_MCE)
return atomic_read(&mce_entry) > 0;
#endif
return 0;
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
{
printk(KERN_CONT "\n");
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc8..ca96e68f0d2 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
}
-struct apic apic_numaq = {
+/* Use __refdata to keep false positive warning calm. */
+struct apic __refdata apic_numaq = {
.name = "NUMAQ",
.probe = probe_numaq,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e..0c0182cc947 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -20,23 +20,12 @@
#include <asm/apic.h>
#include <asm/setup.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
#include <linux/smp.h>
-#include <linux/init.h>
#include <asm/ipi.h>
-#include <linux/smp.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
#include <asm/acpi.h>
#include <asm/e820.h>
-#include <asm/setup.h>
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
@@ -160,7 +149,6 @@ extern struct apic apic_summit;
extern struct apic apic_bigsmp;
extern struct apic apic_es7000;
extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
struct apic *apic = &apic_default;
EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e..bc3e880f9b8 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode && (apic != &apic_x2apic_phys &&
#ifdef CONFIG_X86_UV
apic != &apic_x2apic_uv_x &&
#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d8..eafdfbd1ea9 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -44,7 +44,6 @@
#include <asm/ipi.h>
#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/smp.h>
@@ -173,13 +172,6 @@ static inline int is_WPEG(struct rio_detail *rio){
rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
}
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT 4
-#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d1..a5371ec3677 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,18 +10,20 @@
#include <asm/apic.h>
#include <asm/ipi.h>
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled();
}
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
static const struct cpumask *x2apic_target_cpus(void)
{
- return cpumask_of(0);
+ return cpu_online_mask;
}
/*
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id)
static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
{
- return current_cpu_data.initial_apicid >> index_msb;
+ return initial_apicid >> index_msb;
}
static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e..a8989aadc99 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
static const struct cpumask *x2apic_target_cpus(void)
{
- return cpumask_of(0);
+ return cpu_online_mask;
}
static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id)
static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
{
- return current_cpu_data.initial_apicid >> index_msb;
+ return initial_apicid >> index_msb;
}
static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda6935297..601159374e8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -46,7 +46,7 @@ static int early_get_nodeid(void)
return node_id.s.node_id;
}
-static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (!strcmp(oem_id, "SGI")) {
if (!strcmp(oem_table_id, "UVL"))
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
cpumask_set_cpu(cpu, retmask);
}
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
#ifdef CONFIG_SMP
unsigned long val;
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector)
apic_write(APIC_SELF_IPI, vector);
}
-struct apic apic_x2apic_uv_x = {
+struct apic __refdata apic_x2apic_uv_x = {
.name = "UV large system",
.probe = NULL,
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
.apic_id_registered = uv_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
+ .irq_dest_mode = 0, /* physical */
.target_cpus = uv_target_cpus,
.disable_esr = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
BUG();
}
-static __init void map_low_mmrs(void)
-{
- init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
- init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
-}
-
enum map_type {map_wb, map_uc};
static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
}
-static __init void map_config_high(int max_pnode)
-{
- union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
- int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
- cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
- if (cfg.s.enable)
- map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
-}
-
-static __init void map_mmr_high(int max_pnode)
-{
- union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
- int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
- mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
- if (mmr.s.enable)
- map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
-}
-
static __init void map_mmioh_high(int max_pnode)
{
union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -463,7 +437,7 @@ static void uv_heartbeat(unsigned long ignored)
uv_set_scir_bits(bits);
/* enable next timer period */
- mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
static void __cpuinit uv_heartbeat_enable(int cpu)
@@ -562,18 +536,23 @@ void __init uv_system_init(void)
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
- int max_pnode = 0;
+ int gnode_extra, max_pnode = 0;
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
- map_low_mmrs();
-
m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
+ pnode_mask = (1 << n_val) - 1;
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+ gnode_upper = ((unsigned long)gnode_extra << m_val);
+ printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+ n_val, m_val, gnode_upper, gnode_extra);
+
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +562,20 @@ void __init uv_system_init(void)
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_blade_info);
+ for (blade = 0; blade < uv_num_possible_blades(); blade++)
+ uv_blade_info[blade].memory_nid = -1;
get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_node_to_blade);
memset(uv_node_to_blade, 255, bytes);
bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_cpu_to_blade);
memset(uv_cpu_to_blade, 255, bytes);
blade = 0;
@@ -607,11 +591,6 @@ void __init uv_system_init(void)
}
}
- pnode_mask = (1 << n_val) - 1;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- gnode_upper = (((unsigned long)node_id.s.node_id) &
- ~((1 << n_val) - 1)) << m_val;
-
uv_bios_init();
uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
&sn_coherency_id, &sn_region_size);
@@ -624,6 +603,9 @@ void __init uv_system_init(void)
lcpu = uv_blade_info[blade].nr_possible_cpus;
uv_blade_info[blade].nr_possible_cpus++;
+ /* Any node on the blade, else will contain -1. */
+ uv_blade_info[blade].memory_nid = nid;
+
uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -634,6 +616,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+ uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
@@ -656,11 +639,10 @@ void __init uv_system_init(void)
pnode = (paddr >> m_val) & pnode_mask;
blade = boot_pnode_to_blade(pnode);
uv_node_to_blade[nid] = blade;
+ max_pnode = max(pnode, max_pnode);
}
map_gru_high(max_pnode);
- map_mmr_high(max_pnode);
- map_config_high(max_pnode);
map_mmioh_high(max_pnode);
uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac4..442b5508893 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
u8 ret = 0;
int idled = 0;
int polling;
- int err;
+ int err = 0;
polling = !!(current_thread_info()->status & TS_POLLING);
if (polling) {
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
int err;
struct apm_user *as;
- device_suspend(PMSG_SUSPEND);
+ dpm_suspend_start(PMSG_SUSPEND);
- device_power_down(PMSG_SUSPEND);
+ dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
sysdev_resume();
local_irq_enable();
- device_power_up(PMSG_RESUME);
+ dpm_resume_noirq(PMSG_RESUME);
- device_resume(PMSG_RESUME);
+ dpm_resume_end(PMSG_RESUME);
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
{
int err;
- device_power_down(PMSG_SUSPEND);
+ dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
sysdev_resume();
local_irq_enable();
- device_power_up(PMSG_RESUME);
+ dpm_resume_noirq(PMSG_RESUME);
}
static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- device_resume(PMSG_RESUME);
+ dpm_resume_end(PMSG_RESUME);
queue_event(event, NULL);
}
ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162..dfdbf640389 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK();
@@ -146,4 +147,5 @@ void foo(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b..898ecc47e12 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e..c1f253dac15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
#
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
@@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_common.o = -pg
endif
+# Make sure load_percpu_segment has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_common.o := $(nostackp)
+
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o
@@ -23,11 +27,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
-obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa6..63fddcd082c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -257,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits;
+ int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
-
/* Low order bits define the core id (index of core in socket) */
c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
+ /* use socket ID also for last level cache */
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
#endif
}
@@ -272,7 +275,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
int cpu = smp_processor_id();
int node;
- unsigned apicid = hard_smp_processor_id();
+ unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
node = c->phys_proc_id;
if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +354,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
(c->x86_model == 8 && c->x86_mask >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /* check CPU config space for extended APIC ID */
+ if (cpu_has_apic && c->x86 >= 0xf) {
+ unsigned int val;
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
+#endif
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -388,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
level = cpuid_eax(1);
if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * Some BIOSes incorrectly force this feature, but only K8
+ * revision D (model = 0x14) and later actually support it.
+ */
+ if (c->x86_model < 0x14)
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
}
if (c->x86 == 0x10 || c->x86 == 0x11)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 77848d9fca6..5ce60a88027 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
#include <linux/io.h>
#include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
#include <asm/mmu_context.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -58,7 +59,30 @@ void __init setup_cpu_local_masks(void)
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}
-static const struct cpu_dev *this_cpu __cpuinitdata;
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ display_cacheinfo(c);
+#else
+ /* Not much we can do here... */
+ /* Check if at least it has cpuid */
+ if (c->cpuid_level == -1) {
+ /* No cpuid. It must be an ancient CPU */
+ if (c->x86 == 4)
+ strcpy(c->x86_model_id, "486");
+ else if (c->x86 == 3)
+ strcpy(c->x86_model_id, "386");
+ }
+#endif
+}
+
+static const struct cpu_dev __cpuinitconst default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+ .c_x86_vendor = X86_VENDOR_UNKNOWN,
+};
+
+static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -107,7 +131,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
/* data */
[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
GDT_STACK_CANARY_INIT
#endif
@@ -299,7 +323,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
void load_percpu_segment(int cpu)
{
@@ -330,29 +355,6 @@ void switch_to_new_gdt(int cpu)
static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_64
- display_cacheinfo(c);
-#else
- /* Not much we can do here... */
- /* Check if at least it has cpuid */
- if (c->cpuid_level == -1) {
- /* No cpuid. It must be an ancient CPU */
- if (c->x86 == 4)
- strcpy(c->x86_model_id, "486");
- else if (c->x86 == 3)
- strcpy(c->x86_model_id, "386");
- }
-#endif
-}
-
-static const struct cpu_dev __cpuinitconst default_cpu = {
- .c_init = default_init,
- .c_vendor = "Unknown",
- .c_x86_vendor = X86_VENDOR_UNKNOWN,
-};
-
static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
@@ -485,7 +487,6 @@ out:
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
- static int printed;
int i;
for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -502,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
}
}
- if (!printed) {
- printed++;
- printk(KERN_ERR
- "CPU: vendor_id '%s' unknown, using generic init.\n", v);
-
- printk(KERN_ERR "CPU: Your system may be unstable.\n");
- }
+ printk_once(KERN_ERR
+ "CPU: vendor_id '%s' unknown, using generic init.\n" \
+ "CPU: Your system may be unstable.\n", v);
c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
@@ -768,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_identify)
this_cpu->c_identify(c);
+ /* Clear/Set all flags overriden by options, after probe */
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+
#ifdef CONFIG_X86_64
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
@@ -813,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
#endif
init_hypervisor(c);
+
+ /*
+ * Clear/Set all flags overriden by options, need do it
+ * before following smp all cpus cap AND.
+ */
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -825,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
- /* Clear all flags overriden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
#ifdef CONFIG_X86_MCE
/* Init Machine Check Exception if available. */
mcheck_init(c);
@@ -861,6 +870,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
+ init_hw_perf_counters();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6..6b2a52dd040 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
static DEFINE_MUTEX(cpu_debug_lock);
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
{ "value", CPU_REG_ALL, 1 },
};
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
- { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
- { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
- { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
- { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
- { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
-
- { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
- { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
- { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
- { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
-
- { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
- { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
- { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
- { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
-
- { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
- { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
- { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
- { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
-
- { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
- { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
- { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
- { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
-
- { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
- { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
- { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
- { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
- { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
-
- { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
- { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
- { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
- { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
- { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
- { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
- { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
- { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
-
- { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
- { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
- { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
- { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
- { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
- { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
- { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
-
- { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
- { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
- { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
- { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
- { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
- { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
- { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
- { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
-
- { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
- { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
- { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
- { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
- { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
- { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
- { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
-
- { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
- { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
-
- { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
- { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
- { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
- { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, },
+ { 0x00000006, 0x00000007, CPU_MONITOR, },
+ { 0x00000010, 0x00000010, CPU_TIME, },
+ { 0x00000011, 0x00000013, CPU_PMC, },
+ { 0x00000017, 0x00000017, CPU_PLATFORM, },
+ { 0x0000001B, 0x0000001B, CPU_APIC, },
+ { 0x0000002A, 0x0000002B, CPU_POWERON, },
+ { 0x0000002C, 0x0000002C, CPU_FREQ, },
+ { 0x0000003A, 0x0000003A, CPU_CONTROL, },
+ { 0x00000040, 0x00000047, CPU_LBRANCH, },
+ { 0x00000060, 0x00000067, CPU_LBRANCH, },
+ { 0x00000079, 0x00000079, CPU_BIOS, },
+ { 0x00000088, 0x0000008A, CPU_CACHE, },
+ { 0x0000008B, 0x0000008B, CPU_BIOS, },
+ { 0x0000009B, 0x0000009B, CPU_MONITOR, },
+ { 0x000000C1, 0x000000C4, CPU_PMC, },
+ { 0x000000CD, 0x000000CD, CPU_FREQ, },
+ { 0x000000E7, 0x000000E8, CPU_PERF, },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, },
+
+ { 0x00000116, 0x0000011E, CPU_CACHE, },
+ { 0x00000174, 0x00000176, CPU_SYSENTER, },
+ { 0x00000179, 0x0000017B, CPU_MC, },
+ { 0x00000186, 0x00000189, CPU_PMC, },
+ { 0x00000198, 0x00000199, CPU_PERF, },
+ { 0x0000019A, 0x0000019A, CPU_TIME, },
+ { 0x0000019B, 0x0000019D, CPU_THERM, },
+ { 0x000001A0, 0x000001A0, CPU_MISC, },
+ { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
+ { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, },
+ { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, },
+ { 0x00000250, 0x00000250, CPU_MTRR, },
+ { 0x00000258, 0x00000259, CPU_MTRR, },
+ { 0x00000268, 0x0000026F, CPU_MTRR, },
+ { 0x00000277, 0x00000277, CPU_PAT, },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, },
+
+ { 0x00000300, 0x00000311, CPU_PMC, },
+ { 0x00000345, 0x00000345, CPU_PMC, },
+ { 0x00000360, 0x00000371, CPU_PMC, },
+ { 0x0000038D, 0x00000390, CPU_PMC, },
+ { 0x000003A0, 0x000003BE, CPU_PMC, },
+ { 0x000003C0, 0x000003CD, CPU_PMC, },
+ { 0x000003E0, 0x000003E1, CPU_PMC, },
+ { 0x000003F0, 0x000003F2, CPU_PMC, },
+
+ { 0x00000400, 0x00000417, CPU_MC, },
+ { 0x00000480, 0x0000048B, CPU_VMX, },
+
+ { 0x00000600, 0x00000600, CPU_DEBUG, },
+ { 0x00000680, 0x0000068F, CPU_LBRANCH, },
+ { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
+
+ { 0x000107CC, 0x000107D3, CPU_PMC, },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, },
+ { 0xC0000081, 0xC0000084, CPU_CALL, },
+ { 0xC0000100, 0xC0000102, CPU_BASE, },
+ { 0xC0000103, 0xC0000103, CPU_TIME, },
+
+ { 0xC0010000, 0xC0010007, CPU_PMC, },
+ { 0xC0010010, 0xC0010010, CPU_CONF, },
+ { 0xC0010015, 0xC0010015, CPU_CONF, },
+ { 0xC0010016, 0xC001001A, CPU_MTRR, },
+ { 0xC001001D, 0xC001001D, CPU_MTRR, },
+ { 0xC001001F, 0xC001001F, CPU_CONF, },
+ { 0xC0010030, 0xC0010035, CPU_BIOS, },
+ { 0xC0010044, 0xC0010048, CPU_MC, },
+ { 0xC0010050, 0xC0010056, CPU_SMM, },
+ { 0xC0010058, 0xC0010058, CPU_CONF, },
+ { 0xC0010060, 0xC0010060, CPU_CACHE, },
+ { 0xC0010061, 0xC0010068, CPU_SMM, },
+ { 0xC0010069, 0xC001006B, CPU_SMM, },
+ { 0xC0010070, 0xC0010071, CPU_SMM, },
+ { 0xC0010111, 0xC0010113, CPU_SMM, },
+ { 0xC0010114, 0xC0010118, CPU_SVM, },
+ { 0xC0010140, 0xC0010141, CPU_OSVM, },
+ { 0xC0011022, 0xC0011023, CPU_CONF, },
};
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
- { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
- { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
- { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
- { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
- { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
-
- { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
- { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
- { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
- { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
-
- { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
- { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
- { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
- { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
-
- { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
- { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
- { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
- { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
- { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
- { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
- { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
- { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
- { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
- { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
- { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
- { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
- { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
- { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
- { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
- { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
- { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
- { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
- int flag;
-
- switch (model) {
- case 0x0501:
- case 0x0502:
- case 0x0504:
- flag = CPU_INTEL_PENTIUM;
- break;
- case 0x0601:
- case 0x0603:
- case 0x0605:
- case 0x0607:
- case 0x0608:
- case 0x060A:
- case 0x060B:
- flag = CPU_INTEL_P6;
- break;
- case 0x0609:
- case 0x060D:
- flag = CPU_INTEL_PENTIUM_M;
- break;
- case 0x060E:
- flag = CPU_INTEL_CORE;
- break;
- case 0x060F:
- case 0x0617:
- flag = CPU_INTEL_CORE2;
- break;
- case 0x061C:
- flag = CPU_INTEL_ATOM;
- break;
- case 0x0F00:
- case 0x0F01:
- case 0x0F02:
- case 0x0F03:
- case 0x0F04:
- flag = CPU_INTEL_XEON_P4;
- break;
- case 0x0F06:
- flag = CPU_INTEL_XEON_MP;
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
- int flag;
-
- switch (model >> 8) {
- case 0x6:
- flag = CPU_AMD_K6;
- break;
- case 0x7:
- flag = CPU_AMD_K7;
- break;
- case 0x8:
- flag = CPU_AMD_K8;
- break;
- case 0xf:
- flag = CPU_AMD_0F;
- break;
- case 0x10:
- flag = CPU_AMD_10;
- break;
- case 0x11:
- flag = CPU_AMD_11;
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
- int flag;
-
- flag = per_cpu(cpu_model, cpu);
-
- switch (flag >> 16) {
- case X86_VENDOR_INTEL:
- flag = get_intel_modelflag(flag);
- break;
- case X86_VENDOR_AMD:
- flag = get_amd_modelflag(flag & 0xffff);
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
- int index;
-
- switch (per_cpu(cpu_model, cpu) >> 16) {
- case X86_VENDOR_INTEL:
- index = ARRAY_SIZE(cpu_intel_range);
- break;
- case X86_VENDOR_AMD:
- index = ARRAY_SIZE(cpu_amd_range);
- break;
- default:
- index = 0;
- break;
- }
-
- return index;
-}
-
static int is_typeflag_valid(unsigned cpu, unsigned flag)
{
- unsigned vendor, modelflag;
- int i, index;
+ int i;
/* Standard Registers should be always valid */
if (flag >= CPU_TSS)
return 1;
- modelflag = per_cpu(cpu_modelflag, cpu);
- vendor = per_cpu(cpu_model, cpu) >> 16;
- index = get_cpu_range_count(cpu);
-
- for (i = 0; i < index; i++) {
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if ((cpu_intel_range[i].model & modelflag) &&
- (cpu_intel_range[i].flag & flag))
- return 1;
- break;
- case X86_VENDOR_AMD:
- if ((cpu_amd_range[i].model & modelflag) &&
- (cpu_amd_range[i].flag & flag))
- return 1;
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+ if (cpu_reg_range[i].flag == flag)
+ return 1;
}
/* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
int index, unsigned flag)
{
- unsigned modelflag;
-
- modelflag = per_cpu(cpu_modelflag, cpu);
- *max = 0;
- switch (per_cpu(cpu_model, cpu) >> 16) {
- case X86_VENDOR_INTEL:
- if ((cpu_intel_range[index].model & modelflag) &&
- (cpu_intel_range[index].flag & flag)) {
- *min = cpu_intel_range[index].min;
- *max = cpu_intel_range[index].max;
- }
- break;
- case X86_VENDOR_AMD:
- if ((cpu_amd_range[index].model & modelflag) &&
- (cpu_amd_range[index].flag & flag)) {
- *min = cpu_amd_range[index].min;
- *max = cpu_amd_range[index].max;
- }
- break;
- }
+ if (cpu_reg_range[index].flag == flag) {
+ *min = cpu_reg_range[index].min;
+ *max = cpu_reg_range[index].max;
+ } else
+ *max = 0;
return *max;
}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
unsigned msr, msr_min, msr_max;
struct cpu_private *priv;
u32 low, high;
- int i, range;
+ int i;
if (seq) {
priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
}
}
- range = get_cpu_range_count(cpu);
-
- for (i = 0; i < range; i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
continue;
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ unsigned int i, v, maxeilvt;
+
+ v = apic_read(APIC_EFEAT);
+ maxeilvt = (v >> 16) & 0xff;
+ seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+ seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
+ for (i = 0; i < maxeilvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+ }
+ }
+#endif /* CONFIG_X86_LOCAL_APIC */
seq_printf(seq, "\n MSR\t:\n");
}
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
{
struct dentry *cpu_dentry = NULL;
unsigned reg, reg_min, reg_max;
- int i, range, err = 0;
+ int i, err = 0;
char reg_dir[12];
u32 low, high;
- range = get_cpu_range_count(cpu);
-
- for (i = 0; i < range; i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
cpu_base[type].flag))
continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
cpui = &cpu_data(cpu);
if (!cpu_has(cpui, X86_FEATURE_MSR))
continue;
- per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
- (cpui->x86 << 8) |
- (cpui->x86_model));
- per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
sprintf(cpu_dir, "cpu%d", cpu);
cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c83987547..f138c6c389b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
If in doubt, say N.
config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver"
+ tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
select CPU_FREQ_TABLE
- depends on X86_32
+ depends on X86_32 && EXPERIMENTAL
help
- This adds the CPUFreq driver for VIA C7 processors.
+ This adds the CPUFreq driver for VIA C7 processors. However, this driver
+ does not have any safeguards to prevent operating the CPU out of spec
+ and is thus considered dangerous. Please use the regular ACPI cpufreq
+ driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
If in doubt, say N.
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 752e8c6b2c7..ae9b503220c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
{
struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return 0;
-
- return 1;
+ return cpu_has(cpu, X86_FEATURE_EST);
}
static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index cf52215d9eb..2a50ef89100 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,3 +1,4 @@
+
/*
* (c) 2003-2006 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
@@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
u32 i = 0;
if (cpu_family == CPU_HW_PSTATE) {
- if (data->currpstate == HW_PSTATE_INVALID) {
- /* read (initial) hw pstate if not yet set */
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
-
- /*
- * a workaround for family 11h erratum 311 might cause
- * an "out-of-range Pstate if the core is in Pstate-0
- */
- if (i >= data->numps)
- data->currpstate = HW_PSTATE_0;
- else
- data->currpstate = i;
- }
+ rdmsr(MSR_PSTATE_STATUS, lo, hi);
+ i = lo & HW_PSTATE_MASK;
+ data->currpstate = i;
+
+ /*
+ * a workaround for family 11h erratum 311 might cause
+ * an "out-of-range Pstate if the core is in Pstate-0
+ */
+ if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
+ data->currpstate = HW_PSTATE_0;
+
return 0;
}
do {
@@ -301,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
static int transition_fid_vid(struct powernow_k8_data *data,
u32 reqfid, u32 reqvid)
{
- if (core_voltage_pre_transition(data, reqvid))
+ if (core_voltage_pre_transition(data, reqvid, reqfid))
return 1;
if (core_frequency_transition(data, reqfid))
@@ -329,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data,
/* Phase 1 - core voltage transition ... setup voltage */
static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid)
+ u32 reqvid, u32 reqfid)
{
u32 rvosteps = data->rvo;
u32 savefid = data->currfid;
- u32 maxvid, lo;
+ u32 maxvid, lo, rvomult = 1;
dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
"reqvid 0x%x, rvo 0x%x\n",
smp_processor_id(),
data->currfid, data->currvid, reqvid, data->rvo);
+ if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
+ rvomult = 2;
+ rvosteps *= rvomult;
rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
maxvid = 0x1f & (maxvid >> 16);
dprintk("ph1 maxvid=0x%x\n", maxvid);
@@ -353,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data,
return 1;
}
- while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) {
+ while ((rvosteps > 0) &&
+ ((rvomult * data->rvo + data->currvid) > reqvid)) {
if (data->currvid == maxvid) {
rvosteps = 0;
} else {
@@ -386,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
u32 vcoreqfid, vcocurrfid, vcofiddiff;
u32 fid_interval, savevid = data->currvid;
- if ((reqfid < HI_FID_TABLE_BOTTOM) &&
- (data->currfid < HI_FID_TABLE_BOTTOM)) {
- printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
- "0x%x 0x%x\n", reqfid, data->currfid);
- return 1;
- }
-
if (data->currfid == reqfid) {
printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
data->currfid);
@@ -409,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
: vcoreqfid - vcocurrfid;
+ if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
+ vcofiddiff = 0;
+
while (vcofiddiff > 2) {
(data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
@@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data,
return 0;
}
-static int check_supported_cpu(unsigned int cpu)
+static void check_supported_cpu(void *_rc)
{
- cpumask_t oldmask;
u32 eax, ebx, ecx, edx;
- unsigned int rc = 0;
+ int *rc = _rc;
- oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
- if (smp_processor_id() != cpu) {
- printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
- goto out;
- }
+ *rc = -ENODEV;
if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
- goto out;
+ return;
eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
((eax & CPUID_XFAM) < CPUID_XFAM_10H))
- goto out;
+ return;
if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
printk(KERN_INFO PFX
"Processor cpuid %x not supported\n", eax);
- goto out;
+ return;
}
eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
printk(KERN_INFO PFX
"No frequency change capabilities detected\n");
- goto out;
+ return;
}
cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
@@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu)
!= P_STATE_TRANSITION_CAPABLE) {
printk(KERN_INFO PFX
"Power state transitions not supported\n");
- goto out;
+ return;
}
} else { /* must be a HW Pstate capable processor */
cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
cpu_family = CPU_HW_PSTATE;
else
- goto out;
+ return;
}
- rc = 1;
-
-out:
- set_cpus_allowed_ptr(current, &oldmask);
- return rc;
+ *rc = 0;
}
static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
@@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
return;
- control = data->acpi_data.states[index].control; data->irt = (control
- >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >>
- RVO_SHIFT) & RVO_MASK; data->exttype = (control
- >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
- data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1
- << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable =
- (control >> VST_SHIFT) & VST_MASK; }
+ control = data->acpi_data.states[index].control;
+ data->irt = (control >> IRT_SHIFT) & IRT_MASK;
+ data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
+ data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
+ data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
+ data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
+ data->vstable = (control >> VST_SHIFT) & VST_MASK;
+}
static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
{
@@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data)
if (cur_latency > max_latency)
max_latency = cur_latency;
}
+ if (max_latency == 0) {
+ /*
+ * Fam 11h always returns 0 as transition latency.
+ * This is intended and means "very fast". While cpufreq core
+ * and governors currently can handle that gracefully, better
+ * set it to 1 to avoid problems in the future.
+ * For all others it's a BIOS bug.
+ */
+ if (!boot_cpu_data.x86 == 0x11)
+ printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
+ "latency\n");
+ max_latency = 1;
+ }
/* value in usecs, needs to be in nanoseconds */
return 1000 * max_latency;
}
@@ -1080,20 +1081,12 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
return 0;
}
- if ((fid < HI_FID_TABLE_BOTTOM) &&
- (data->currfid < HI_FID_TABLE_BOTTOM)) {
- printk(KERN_ERR PFX
- "ignoring illegal change in lo freq table-%x to 0x%x\n",
- data->currfid, fid);
- return 1;
- }
-
dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
smp_processor_id(), fid, vid);
freqs.old = find_khz_freq_from_fid(data->currfid);
freqs.new = find_khz_freq_from_fid(fid);
- for_each_cpu_mask_nr(i, *(data->available_cores)) {
+ for_each_cpu(i, data->available_cores) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -1101,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
res = transition_fid_vid(data, fid, vid);
freqs.new = find_khz_freq_from_fid(data->currfid);
- for_each_cpu_mask_nr(i, *(data->available_cores)) {
+ for_each_cpu(i, data->available_cores) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -1126,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
data->currpstate);
freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
- for_each_cpu_mask_nr(i, *(data->available_cores)) {
+ for_each_cpu(i, data->available_cores) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -1134,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
res = transition_pstate(data, pstate);
freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
- for_each_cpu_mask_nr(i, *(data->available_cores)) {
+ for_each_cpu(i, data->available_cores) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -1235,21 +1228,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
return cpufreq_frequency_table_verify(pol, data->powernow_table);
}
-static const char ACPI_PSS_BIOS_BUG_MSG[] =
- KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
- KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
+struct init_on_cpu {
+ struct powernow_k8_data *data;
+ int rc;
+};
+
+static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
+{
+ struct init_on_cpu *init_on_cpu = _init_on_cpu;
+
+ if (pending_bit_stuck()) {
+ printk(KERN_ERR PFX "failing init, change pending bit set\n");
+ init_on_cpu->rc = -ENODEV;
+ return;
+ }
+
+ if (query_current_values_with_pending_wait(init_on_cpu->data)) {
+ init_on_cpu->rc = -ENODEV;
+ return;
+ }
+
+ if (cpu_family == CPU_OPTERON)
+ fidvid_msr_init();
+
+ init_on_cpu->rc = 0;
+}
/* per CPU init entry point to the driver */
static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
{
+ static const char ACPI_PSS_BIOS_BUG_MSG[] =
+ KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
+ FW_BUG PFX "Try again with latest BIOS.\n";
struct powernow_k8_data *data;
- cpumask_t oldmask;
+ struct init_on_cpu init_on_cpu;
int rc;
if (!cpu_online(pol->cpu))
return -ENODEV;
- if (!check_supported_cpu(pol->cpu))
+ smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
+ if (rc)
return -ENODEV;
data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
@@ -1289,27 +1308,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
pol->cpuinfo.transition_latency = get_transition_latency(data);
/* only run on specific CPU from here on */
- oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
-
- if (smp_processor_id() != pol->cpu) {
- printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
- goto err_out_unmask;
- }
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing init, change pending bit set\n");
- goto err_out_unmask;
- }
-
- if (query_current_values_with_pending_wait(data))
- goto err_out_unmask;
-
- if (cpu_family == CPU_OPTERON)
- fidvid_msr_init();
-
- /* run on any CPU again */
- set_cpus_allowed_ptr(current, &oldmask);
+ init_on_cpu.data = data;
+ smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
+ &init_on_cpu, 1);
+ rc = init_on_cpu.rc;
+ if (rc != 0)
+ goto err_out_exit_acpi;
if (cpu_family == CPU_HW_PSTATE)
cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
@@ -1346,8 +1350,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
return 0;
-err_out_unmask:
- set_cpus_allowed_ptr(current, &oldmask);
+err_out_exit_acpi:
powernow_k8_cpu_exit_acpi(data);
err_out:
@@ -1372,28 +1375,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
return 0;
}
+static void query_values_on_cpu(void *_err)
+{
+ int *err = _err;
+ struct powernow_k8_data *data = __get_cpu_var(powernow_data);
+
+ *err = query_current_values_with_pending_wait(data);
+}
+
static unsigned int powernowk8_get(unsigned int cpu)
{
- struct powernow_k8_data *data;
- cpumask_t oldmask = current->cpus_allowed;
+ struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
unsigned int khz = 0;
- unsigned int first;
-
- first = cpumask_first(cpu_core_mask(cpu));
- data = per_cpu(powernow_data, first);
+ int err;
if (!data)
return -EINVAL;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (smp_processor_id() != cpu) {
- printk(KERN_ERR PFX
- "limiting to CPU %d failed in powernowk8_get\n", cpu);
- set_cpus_allowed_ptr(current, &oldmask);
- return 0;
- }
-
- if (query_current_values_with_pending_wait(data))
+ smp_call_function_single(cpu, query_values_on_cpu, &err, true);
+ if (err)
goto out;
if (cpu_family == CPU_HW_PSTATE)
@@ -1404,7 +1404,6 @@ static unsigned int powernowk8_get(unsigned int cpu)
out:
- set_cpus_allowed_ptr(current, &oldmask);
return khz;
}
@@ -1430,7 +1429,9 @@ static int __cpuinit powernowk8_init(void)
unsigned int i, supported_cpus = 0;
for_each_online_cpu(i) {
- if (check_supported_cpu(i))
+ int rc;
+ smp_call_function_single(i, check_supported_cpu, &rc, 1);
+ if (rc == 0)
supported_cpus++;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 6c6698feade..02ce824073c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -215,7 +215,8 @@ struct pst_s {
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
-static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid);
+static int core_voltage_pre_transition(struct powernow_k8_data *data,
+ u32 reqvid, u32 regfid);
static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
@@ -223,14 +224,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-
-#ifdef CONFIG_SMP
-static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
-{
-}
-#else
-static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
-{
- cpu_set(0, cpu_sharedcore_mask[0]);
-}
-#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 55c831ed71c..8d672ef162c 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu)
{
unsigned l, h;
unsigned clock_freq;
- cpumask_t saved_mask;
- saved_mask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (smp_processor_id() != cpu)
- return 0;
-
- rdmsr(MSR_IA32_PERF_STATUS, l, h);
+ rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
clock_freq = extract_clock(l, cpu, 0);
if (unlikely(clock_freq == 0)) {
@@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
* P-state transition (like TM2). Get the last freq set
* in PERF_CTL.
*/
- rdmsr(MSR_IA32_PERF_CTL, l, h);
+ rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
clock_freq = extract_clock(l, cpu, 1);
}
-
- set_cpus_allowed_ptr(current, &saved_mask);
return clock_freq;
}
@@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy,
struct cpufreq_freqs freqs;
int retval = 0;
unsigned int j, k, first_cpu, tmp;
- cpumask_var_t saved_mask, covered_cpus;
+ cpumask_var_t covered_cpus;
- if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
- return -ENOMEM;
- if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
- free_cpumask_var(saved_mask);
+ if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
return -ENOMEM;
- }
- cpumask_copy(saved_mask, &current->cpus_allowed);
if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
retval = -ENODEV;
@@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy,
first_cpu = 1;
for_each_cpu(j, policy->cpus) {
- const struct cpumask *mask;
+ int good_cpu;
/* cpufreq holds the hotplug lock, so we are safe here */
if (!cpu_online(j))
@@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy,
* Make sure we are running on CPU that wants to change freq
*/
if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- mask = policy->cpus;
+ good_cpu = cpumask_any_and(policy->cpus,
+ cpu_online_mask);
else
- mask = cpumask_of(j);
+ good_cpu = j;
- set_cpus_allowed_ptr(current, mask);
- preempt_disable();
- if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
+ if (good_cpu >= nr_cpu_ids) {
dprintk("couldn't limit to CPUs in this domain\n");
retval = -EAGAIN;
if (first_cpu) {
/* We haven't started the transition yet. */
- goto migrate_end;
+ goto out;
}
- preempt_enable();
break;
}
msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
if (first_cpu) {
- rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
+ rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
if (msr == (oldmsr & 0xffff)) {
dprintk("no change needed - msr was and needs "
"to be %x\n", oldmsr);
retval = 0;
- goto migrate_end;
+ goto out;
}
freqs.old = extract_clock(oldmsr, cpu, 0);
@@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy,
oldmsr |= msr;
}
- wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- preempt_enable();
+ wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
+ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
break;
- }
- cpu_set(j, *covered_cpus);
- preempt_enable();
+ cpumask_set_cpu(j, covered_cpus);
}
for_each_cpu(k, policy->cpus) {
@@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy,
* Best effort undo..
*/
- for_each_cpu_mask_nr(j, *covered_cpus) {
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(j));
- wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
- }
+ for_each_cpu(j, covered_cpus)
+ wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
tmp = freqs.new;
freqs.new = freqs.old;
@@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy,
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
}
- set_cpus_allowed_ptr(current, saved_mask);
retval = 0;
- goto out;
-migrate_end:
- preempt_enable();
- set_cpus_allowed_ptr(current, saved_mask);
out:
- free_cpumask_var(saved_mask);
free_cpumask_var(covered_cpus);
return retval;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 016c1a4fa3f..6911e91fb4f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -89,7 +89,8 @@ static int speedstep_find_register(void)
* speedstep_set_state - set the SpeedStep state
* @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
*
- * Tries to change the SpeedStep state.
+ * Tries to change the SpeedStep state. Can be called from
+ * smp_call_function_single.
*/
static void speedstep_set_state(unsigned int state)
{
@@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state)
return;
}
+/* Wrapper for smp_call_function_single. */
+static void _speedstep_set_state(void *_state)
+{
+ speedstep_set_state(*(unsigned int *)_state);
+}
/**
* speedstep_activate - activate SpeedStep control in the chipset
@@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void)
return 0;
}
-static unsigned int _speedstep_get(const struct cpumask *cpus)
-{
+struct get_freq_data {
unsigned int speed;
- cpumask_t cpus_allowed;
-
- cpus_allowed = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpus);
- speed = speedstep_get_frequency(speedstep_processor);
- set_cpus_allowed_ptr(current, &cpus_allowed);
- dprintk("detected %u kHz as current frequency\n", speed);
- return speed;
+ unsigned int processor;
+};
+
+static void get_freq_data(void *_data)
+{
+ struct get_freq_data *data = _data;
+
+ data->speed = speedstep_get_frequency(data->processor);
}
static unsigned int speedstep_get(unsigned int cpu)
{
- return _speedstep_get(cpumask_of(cpu));
+ struct get_freq_data data = { .processor = cpu };
+
+ /* You're supposed to ensure CPU is online. */
+ if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
+ BUG();
+
+ dprintk("detected %u kHz as current frequency\n", data.speed);
+ return data.speed;
}
/**
@@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
- unsigned int newstate = 0;
+ unsigned int newstate = 0, policy_cpu;
struct cpufreq_freqs freqs;
- cpumask_t cpus_allowed;
int i;
if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
target_freq, relation, &newstate))
return -EINVAL;
- freqs.old = _speedstep_get(policy->cpus);
+ policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
+ freqs.old = speedstep_get(policy_cpu);
freqs.new = speedstep_freqs[newstate].frequency;
freqs.cpu = policy->cpu;
@@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy,
if (freqs.old == freqs.new)
return 0;
- cpus_allowed = current->cpus_allowed;
-
for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
- /* switch to physical CPU where state is to be changed */
- set_cpus_allowed_ptr(current, policy->cpus);
-
- speedstep_set_state(newstate);
-
- /* allow to be run on all CPUs */
- set_cpus_allowed_ptr(current, &cpus_allowed);
+ smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
+ true);
for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
@@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy)
return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
}
+struct get_freqs {
+ struct cpufreq_policy *policy;
+ int ret;
+};
+
+static void get_freqs_on_cpu(void *_get_freqs)
+{
+ struct get_freqs *get_freqs = _get_freqs;
+
+ get_freqs->ret =
+ speedstep_get_freqs(speedstep_processor,
+ &speedstep_freqs[SPEEDSTEP_LOW].frequency,
+ &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
+ &get_freqs->policy->cpuinfo.transition_latency,
+ &speedstep_set_state);
+}
static int speedstep_cpu_init(struct cpufreq_policy *policy)
{
- int result = 0;
- unsigned int speed;
- cpumask_t cpus_allowed;
+ int result;
+ unsigned int policy_cpu, speed;
+ struct get_freqs gf;
/* only run on CPU to be set, or on its sibling */
#ifdef CONFIG_SMP
cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
#endif
-
- cpus_allowed = current->cpus_allowed;
- set_cpus_allowed_ptr(current, policy->cpus);
+ policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
/* detect low and high frequency and transition latency */
- result = speedstep_get_freqs(speedstep_processor,
- &speedstep_freqs[SPEEDSTEP_LOW].frequency,
- &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
- &policy->cpuinfo.transition_latency,
- &speedstep_set_state);
- set_cpus_allowed_ptr(current, &cpus_allowed);
- if (result)
- return result;
+ gf.policy = policy;
+ smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
+ if (gf.ret)
+ return gf.ret;
/* get current speed setting */
- speed = _speedstep_get(policy->cpus);
+ speed = speedstep_get(policy_cpu);
if (!speed)
return -EIO;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 2e3c6862657..f4c290b8482 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void)
}
+/* Warning: may get called from smp_call_function_single. */
unsigned int speedstep_get_frequency(unsigned int processor)
{
switch (processor) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c0..3260ab04499 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && c->x86_model < 15)
clear_cpu_cap(c, X86_FEATURE_PAT);
+
+#ifdef CONFIG_KMEMCHECK
+ /*
+ * P4s have a "fast strings" feature which causes single-
+ * stepping REP instructions to only generate a #DB on
+ * cache-line boundaries.
+ *
+ * Ingo Molnar reported a Pentium D (model 6) and a Xeon
+ * (model 2) with the same problem.
+ */
+ if (c->x86 == 15) {
+ u64 misc_enable;
+
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+ printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
+
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ }
+ }
+#endif
}
#ifdef CONFIG_X86_32
@@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
}
#endif
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
{
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
unsigned node;
int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
+ int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
@@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
/* Work around errata */
- srat_detect_node();
+ srat_detect_node(c);
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e10..789efe217e1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
#include <asm/processor.h>
#include <asm/smp.h>
+#include <asm/k8.h>
#define LVL_1_INST 1
#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
unsigned long can_disable;
};
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
- {}
-};
-#endif
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
};
static const unsigned short __cpuinitconst assocs[] = {
- [1] = 1, [2] = 2, [4] = 4, [6] = 8,
- [8] = 16, [0xa] = 32, [0xb] = 48,
+ [1] = 1,
+ [2] = 2,
+ [4] = 4,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
[0xc] = 64,
- [0xf] = 0xffff // ??
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = 0xffff /* fully associative - no way to show this currently */
};
static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
if (leaf == 3)
- eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+ eax->split.num_threads_sharing =
+ current_cpu_data.x86_max_cores - 1;
else
eax->split.num_threads_sharing = 0;
eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
if (index < 3)
return;
+
+ if (boot_cpu_data.x86 == 0x11)
+ return;
+
+ /* see erratum #382 */
+ if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+ return;
+
this_leaf->can_disable = 1;
}
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
#define to_object(k) container_of(k, struct _index_kobject, kobj)
#define to_attr(a) container_of(a, struct _cache_attr, attr)
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
- struct pci_dev *dev = NULL;
- int i;
-
- for (i = 0; i <= node; i++) {
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (!dev)
- break;
- } while (!pci_match_id(&k8_nb_id[0], dev));
- if (!dev)
- break;
- }
- return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
- return NULL;
-}
-#endif
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int index)
{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int node = cpu_to_node(cpumask_first(mask));
- struct pci_dev *dev = NULL;
- ssize_t ret = 0;
- int i;
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int reg = 0;
if (!this_leaf->can_disable)
- return sprintf(buf, "Feature not enabled\n");
-
- dev = get_k8_northbridge(node);
- if (!dev) {
- printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
return -EINVAL;
- }
- for (i = 0; i < 2; i++) {
- unsigned int reg;
+ if (!dev)
+ return -EINVAL;
- pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+ pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ return sprintf(buf, "%x\n", reg);
+}
- ret += sprintf(buf, "%sEntry: %d\n", buf, i);
- ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
- buf,
- reg & 0x80000000 ? "Disabled" : "Allowed",
- reg & 0x40000000 ? "Disabled" : "Allowed");
- ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
- buf, (reg & 0x30000) >> 16, reg & 0xfff);
- }
- return ret;
+#define SHOW_CACHE_DISABLE(index) \
+static ssize_t \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+{ \
+ return show_cache_disable(this_leaf, buf, index); \
}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
- size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count, unsigned int index)
{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int node = cpu_to_node(cpumask_first(mask));
- struct pci_dev *dev = NULL;
- unsigned int ret, index, val;
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned long val = 0;
+ unsigned int scrubber = 0;
if (!this_leaf->can_disable)
- return 0;
-
- if (strlen(buf) > 15)
return -EINVAL;
- ret = sscanf(buf, "%x %x", &index, &val);
- if (ret != 2)
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
return -EINVAL;
- if (index > 1)
+
+ if (strict_strtoul(buf, 10, &val) < 0)
return -EINVAL;
val |= 0xc0000000;
- dev = get_k8_northbridge(node);
- if (!dev) {
- printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
- return -EINVAL;
- }
+
+ pci_read_config_dword(dev, 0x58, &scrubber);
+ scrubber &= ~0x1f000000;
+ pci_write_config_dword(dev, 0x58, scrubber);
pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
wbinvd();
pci_write_config_dword(dev, 0x1BC + index * 4, val);
+ return count;
+}
- return 1;
+#define STORE_CACHE_DISABLE(index) \
+static ssize_t \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+ const char *buf, size_t count) \
+{ \
+ return store_cache_disable(this_leaf, buf, count, index); \
}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
struct _cache_attr {
struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+ show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+ show_cache_disable_1, store_cache_disable_1);
static struct attribute * default_attrs[] = {
&type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
&size.attr,
&shared_cpu_map.attr,
&shared_cpu_list.attr,
- &cache_disable.attr,
+ &cache_disable_0.attr,
+ &cache_disable_1.attr,
NULL
};
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe..188a1ca5ad2 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,12 @@
-obj-y = mce_$(BITS).o therm_throt.o
+obj-y = mce.o
-obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
-obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
-obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
+obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
+obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39..b945d5dbc60 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,25 +2,23 @@
* Athlon specific Machine Check Exception Reporting
* (C) Copyright 2002 Dave Jones <davej@redhat.com>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/mce.h>
#include <asm/msr.h>
-#include "mce.h"
-
-/* Machine Check Handler For AMD Athlon/Duron */
+/* Machine Check Handler For AMD Athlon/Duron: */
static void k7_machine_check(struct pt_regs *regs, long error_code)
{
- int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
+ int recover = 1;
int i;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +30,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
for (i = 1; i < nr_mce_banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high&(1<<31)) {
+ if (high & (1<<31)) {
char misc[20];
char addr[24];
- misc[0] = addr[0] = '\0';
+
+ misc[0] = '\0';
+ addr[0] = '\0';
+
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
high &= ~(1<<31);
+
if (high & (1<<27)) {
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +51,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
snprintf(addr, 24, " at %08x%08x", ahigh, alow);
}
+
printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
smp_processor_id(), i, high, low, misc, addr);
- /* Clear it */
+
+ /* Clear it: */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
- /* Serialize */
+ /* Serialize: */
wmb();
add_taint(TAINT_MACHINE_CHECK);
}
}
- if (recover&2)
+ if (recover & 2)
panic("CPU context corrupt");
- if (recover&1)
+ if (recover & 1)
panic("Unable to continue");
+
printk(KERN_EMERG "Attempting to continue.\n");
+
mcgstl &= ~(1<<2);
wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
-/* AMD K7 machine check is Intel like */
+/* AMD K7 machine check is Intel like: */
void amd_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -79,21 +85,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
return;
machine_check_vector = k7_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
printk(KERN_INFO "Intel machine check architecture supported.\n");
+
rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
- /* Clear status for MC index 0 separately, we don't touch CTL,
- * as some K7 Athlons cause spurious MCEs when its enabled. */
+ /*
+ * Clear status for MC index 0 separately, we don't touch CTL,
+ * as some K7 Athlons cause spurious MCEs when its enabled:
+ */
if (boot_cpu_data.x86 == 6) {
wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
i = 1;
} else
i = 0;
+
for (; i < nr_mce_banks; i++) {
wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 00000000000..a3a235a53f0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ */
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <asm/mce.h>
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure noone reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+struct delayed_mce {
+ struct timer_list timer;
+ struct mce m;
+};
+
+/* Inject mce on current CPU */
+static void raise_mce(unsigned long data)
+{
+ struct delayed_mce *dm = (struct delayed_mce *)data;
+ struct mce *m = &dm->m;
+ int cpu = m->extcpu;
+
+ inject_mce(m);
+ if (m->status & MCI_STATUS_UC) {
+ struct pt_regs regs;
+ memset(&regs, 0, sizeof(struct pt_regs));
+ regs.ip = m->ip;
+ regs.cs = m->cs;
+ printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+ do_machine_check(&regs, 0);
+ printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+ } else {
+ mce_banks_t b;
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+ machine_check_poll(0, &b);
+ mce_notify_irq();
+ printk(KERN_INFO "Finished machine check poll on CPU %d\n",
+ cpu);
+ }
+ kfree(dm);
+}
+
+/* Error injection interface */
+static ssize_t mce_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ struct delayed_mce *dm;
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ return -EINVAL;
+
+ dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
+ if (!dm)
+ return -ENOMEM;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffie or two later everywhere.
+ * Should we use a hrtimer here for better synchronization?
+ */
+ memcpy(&dm->m, &m, sizeof(struct mce));
+ setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
+ dm->timer.expires = jiffies + 2;
+ add_timer_on(&dm->timer, m.extcpu);
+ return usize;
+}
+
+static int inject_init(void)
+{
+ printk(KERN_INFO "Machine check injector initialized\n");
+ mce_chrdev_ops.write = mce_write;
+ return 0;
+}
+
+module_init(inject_init);
+/*
+ * Cannot tolerate unloading currently because we cannot
+ * guarantee all openers of mce_chrdev will get a reference to us.
+ */
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 00000000000..54dcb8ff12e
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
+#include <asm/mce.h>
+
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+int mce_severity(struct mce *a, int tolerant, char **msg);
+
+extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 00000000000..ff0807f9705
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned char mcgmask;
+ unsigned char mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char covered;
+ char *msg;
+} severities[] = {
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
+#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
+#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
+#define MCGMASK(x, res, s, m, r...) \
+ { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
+#define MASK(x, y, s, m, r...) \
+ { .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCACOD 0xffff
+
+ BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
+ BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
+ BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+ /* When MCIP is not set something is very confused */
+ MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+ /* Neither return not error IP -- no chance to recover -> PANIC */
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
+ "Neither restart nor error IP"),
+ MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
+ KERNEL),
+ BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
+ MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
+ "Spurious not enabled", SER),
+
+ /* ignore OVER for UCNA */
+ MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
+ "Uncorrected no action required", SER),
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
+ "Illegal combination (UCNA with AR=1)", SER),
+ MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+
+ /* AR add known MCACODs here */
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
+ "Action required with lost events", SER),
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
+ "Action required; unknown MCACOD", SER),
+
+ /* known AO MCACODs: */
+ MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
+ "Action optional: memory scrubbing error", SER),
+ MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
+ "Action optional: last level cache writeback error", SER),
+
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
+ "Action optional unknown MCACOD", SER),
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
+ "Action optional with lost events", SER),
+ BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
+ BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
+ BITSET(0, SOME, "No match") /* always matches. keep at end */
+};
+
+/*
+ * If the EIPV bit is set, it means the saved IP is the
+ * instruction which caused the MCE.
+ */
+static int error_context(struct mce *m)
+{
+ if (m->mcgstatus & MCG_STATUS_EIPV)
+ return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+ /* Unknown, assume kernel */
+ return IN_KERNEL;
+}
+
+int mce_severity(struct mce *a, int tolerant, char **msg)
+{
+ enum context ctx = error_context(a);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((a->status & s->mask) != s->result)
+ continue;
+ if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mce_ser)
+ continue;
+ if (s->ser == NO_SER && mce_ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+ if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
+ if (panic_on_oops || tolerant < 1)
+ return MCE_PANIC_SEVERITY;
+ }
+ return s->sev;
+ }
+}
+
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+ const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(severities); i++)
+ severities[i].covered = 0;
+ return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+};
+
+static int __init severities_debugfs_init(void)
+{
+ struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+
+ dmce = debugfs_create_dir("mce", NULL);
+ if (dmce == NULL)
+ goto err_out;
+ fseverities_coverage = debugfs_create_file("severities-coverage",
+ 0444, dmce, NULL,
+ &severities_coverage_fops);
+ if (fseverities_coverage == NULL)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ if (fseverities_coverage)
+ debugfs_remove(fseverities_coverage);
+ if (dmce)
+ debugfs_remove(dmce);
+ return -ENOMEM;
+}
+late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 00000000000..01213048f62
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,2060 @@
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/interrupt.h>
+#include <linux/ratelimit.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/sysdev.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/processor.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/ipi.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+#include "mce-internal.h"
+
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+ printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+ unexpected_machine_check;
+
+int mce_disabled __read_mostly;
+
+#ifdef CONFIG_X86_NEW_MCE
+
+#define MISC_MCELOG_MINOR 227
+
+#define SPINUNIT 100 /* 100ns */
+
+atomic_t mce_entry;
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+/*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+static int tolerant __read_mostly = 1;
+static int banks __read_mostly;
+static u64 *bank __read_mostly;
+static int rip_msr __read_mostly;
+static int mce_bootlog __read_mostly = -1;
+static int monarch_timeout __read_mostly = -1;
+static int mce_panic_timeout __read_mostly;
+static int mce_dont_log_ce __read_mostly;
+int mce_cmci_disabled __read_mostly;
+int mce_ignore_ce __read_mostly;
+int mce_ser __read_mostly;
+
+/* User mode helper program triggered by machine check event */
+static unsigned long mce_need_notify;
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+static unsigned long dont_init_banks;
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
+
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+static inline int skip_bank_init(int i)
+{
+ return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
+}
+
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = m->extcpu = smp_processor_id();
+ rdtscll(m->tsc);
+ /* We hope get_seconds stays lockless */
+ m->time = get_seconds();
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->cpuid = cpuid_eax(1);
+#ifdef CONFIG_SMP
+ m->socketid = cpu_data(m->extcpu).phys_proc_id;
+#endif
+ m->apicid = cpu_data(m->extcpu).initial_apicid;
+ rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+void mce_log(struct mce *mce)
+{
+ unsigned next, entry;
+
+ mce->finished = 0;
+ wmb();
+ for (;;) {
+ entry = rcu_dereference(mcelog.next);
+ for (;;) {
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
+ return;
+ }
+ /* Old left over entry. Skip: */
+ if (mcelog.entry[entry].finished) {
+ entry++;
+ continue;
+ }
+ break;
+ }
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ wmb();
+ mcelog.entry[entry].finished = 1;
+ wmb();
+
+ mce->finished = 1;
+ set_bit(0, &mce_need_notify);
+}
+
+static void print_mce(struct mce *m)
+{
+ printk(KERN_EMERG
+ "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ m->extcpu, m->mcgstatus, m->bank, m->status);
+ if (m->ip) {
+ printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+ if (m->cs == __KERNEL_CS)
+ print_symbol("{%s}", m->ip);
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_EMERG "TSC %llx ", m->tsc);
+ if (m->addr)
+ printk(KERN_CONT "ADDR %llx ", m->addr);
+ if (m->misc)
+ printk(KERN_CONT "MISC %llx ", m->misc);
+ printk(KERN_CONT "\n");
+ printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid,
+ m->apicid);
+}
+
+static void print_mce_head(void)
+{
+ printk(KERN_EMERG "\nHARDWARE ERROR\n");
+}
+
+static void print_mce_tail(void)
+{
+ printk(KERN_EMERG "This is not a software problem!\n"
+ "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_paniced;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static void mce_panic(char *msg, struct mce *final, char *exp)
+{
+ int i;
+
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_add_return(1, &mce_paniced) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ print_mce_head();
+ /* First print corrected ones that are still unlogged */
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if (!(m->status & MCI_STATUS_UC))
+ print_mce(m);
+ }
+ /* Now print uncorrected but with the final one last */
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || memcmp(m, final, sizeof(struct mce)))
+ print_mce(m);
+ }
+ if (final)
+ print_mce(final);
+ if (cpu_missing)
+ printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
+ print_mce_tail();
+ if (exp)
+ printk(KERN_EMERG "Machine check: %s\n", exp);
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
+ panic(msg);
+}
+
+/* Support code for software error injection */
+
+static int msr_to_offset(u32 msr)
+{
+ unsigned bank = __get_cpu_var(injectm.bank);
+ if (msr == rip_msr)
+ return offsetof(struct mce, ip);
+ if (msr == MSR_IA32_MC0_STATUS + bank*4)
+ return offsetof(struct mce, status);
+ if (msr == MSR_IA32_MC0_ADDR + bank*4)
+ return offsetof(struct mce, addr);
+ if (msr == MSR_IA32_MC0_MISC + bank*4)
+ return offsetof(struct mce, misc);
+ if (msr == MSR_IA32_MCG_STATUS)
+ return offsetof(struct mce, mcgstatus);
+ return -1;
+}
+
+/* MSR access wrappers used for error injection */
+static u64 mce_rdmsrl(u32 msr)
+{
+ u64 v;
+ if (__get_cpu_var(injectm).finished) {
+ int offset = msr_to_offset(msr);
+ if (offset < 0)
+ return 0;
+ return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
+ }
+ rdmsrl(msr, v);
+ return v;
+}
+
+static void mce_wrmsrl(u32 msr, u64 v)
+{
+ if (__get_cpu_var(injectm).finished) {
+ int offset = msr_to_offset(msr);
+ if (offset >= 0)
+ *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
+ return;
+ }
+ wrmsrl(msr, v);
+}
+
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler with the
+ * process context work function. This is vastly simplified because there's
+ * only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16 /* we use one entry less */
+
+struct mce_ring {
+ unsigned short start;
+ unsigned short end;
+ unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+ struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+ return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+ struct mce_ring *r;
+ int ret = 0;
+
+ *pfn = 0;
+ get_cpu();
+ r = &__get_cpu_var(mce_ring);
+ if (r->start == r->end)
+ goto out;
+ *pfn = r->ring[r->start];
+ r->start = (r->start + 1) % MCE_RING_SIZE;
+ ret = 1;
+out:
+ put_cpu();
+ return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+ struct mce_ring *r = &__get_cpu_var(mce_ring);
+ unsigned next;
+
+ next = (r->end + 1) % MCE_RING_SIZE;
+ if (next == r->start)
+ return -1;
+ r->ring[r->end] = pfn;
+ wmb();
+ r->end = next;
+ return 0;
+}
+
+int mce_available(struct cpuinfo_x86 *c)
+{
+ if (mce_disabled)
+ return 0;
+ return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+ if (!mce_ring_empty()) {
+ struct work_struct *work = &__get_cpu_var(mce_work);
+ if (!work_pending(work))
+ schedule_work(work);
+ }
+}
+
+/*
+ * Get the address of the instruction at the time of the machine check
+ * error.
+ */
+static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
+{
+
+ if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+ } else {
+ m->ip = 0;
+ m->cs = 0;
+ }
+ if (rip_msr)
+ m->ip = mce_rdmsrl(rip_msr);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Called after interrupts have been reenabled again
+ * when a MCE happened during an interrupts off region
+ * in the kernel.
+ */
+asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+ mce_notify_irq();
+ mce_schedule_work();
+ irq_exit();
+}
+#endif
+
+static void mce_report_event(struct pt_regs *regs)
+{
+ if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
+ mce_notify_irq();
+ /*
+ * Triggering the work queue here is just an insurance
+ * policy in case the syscall exit notify handler
+ * doesn't run soon enough or ends up running on the
+ * wrong CPU (can happen when audit sleeps)
+ */
+ mce_schedule_work();
+ return;
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Without APIC do not notify. The event will be picked
+ * up eventually.
+ */
+ if (!cpu_has_apic)
+ return;
+
+ /*
+ * When interrupts are disabled we cannot use
+ * kernel services safely. Trigger an self interrupt
+ * through the APIC to instead do the notification
+ * after interrupts are reenabled again.
+ */
+ apic->send_IPI_self(MCE_SELF_VECTOR);
+
+ /*
+ * Wait for idle afterwards again so that we don't leave the
+ * APIC in a non idle state because the normal APIC writes
+ * cannot exclude us.
+ */
+ apic_wait_icr_idle();
+#endif
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll hander -- * so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally * confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce m;
+ int i;
+
+ __get_cpu_var(mce_poll_count)++;
+
+ mce_setup(&m);
+
+ m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ for (i = 0; i < banks; i++) {
+ if (!bank[i] || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ barrier();
+ m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected or signalled events are handled by the exception
+ * handler when it is enabled, so don't process those here.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if (!(flags & MCP_UC) &&
+ (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+ continue;
+
+ if (m.status & MCI_STATUS_MISCV)
+ m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
+ if (m.status & MCI_STATUS_ADDRV)
+ m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+ }
+
+ /*
+ * Clear state for this bank.
+ */
+ mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_paniced))
+ wait_for_panic();
+ if (!monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ /* CHECKME: Make panic default for 1 too? */
+ if (tolerant < 1)
+ mce_panic("Timeout synchronizing machine check over CPUs",
+ NULL, NULL);
+ cpu_missing = 1;
+ return 1;
+ }
+ *t -= SPINUNIT;
+out:
+ touch_nmi_watchdog();
+ return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of an machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ int cpu;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ char *nmsg = NULL;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+ &nmsg);
+ if (severity > global_worst) {
+ msg = nmsg;
+ global_worst = severity;
+ m = &per_cpu(mces_seen, cpu);
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+ mce_panic("Fatal Machine check", m, msg);
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (!m && tolerant < 3)
+ mce_panic("Machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the mces_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int *no_way_out)
+{
+ int order;
+ int cpus = num_online_cpus();
+ u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ return -1;
+
+ atomic_add(*no_way_out, &global_nwo);
+ /*
+ * global_nwo should be updated before mce_callin
+ */
+ smp_wmb();
+ order = atomic_add_return(1, &mce_callin);
+
+ /*
+ * Wait for everyone.
+ */
+ while (atomic_read(&mce_callin) != cpus) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = atomic_read(&global_nwo);
+
+ return order;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+ int ret = -1;
+ u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /* CHECKME: Can this race with a parallel hotplug? */
+ int cpus = num_online_cpus();
+
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= cpus) {
+ if (mce_timed_out(&timeout))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ return 0;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+ return ret;
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses upto page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ return 0;
+ if ((m->misc & 0x3f) > PAGE_SHIFT)
+ return 0;
+ if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ return 0;
+ return 1;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ */
+void do_machine_check(struct pt_regs *regs, long error_code)
+{
+ struct mce m, *final;
+ int i;
+ int worst = 0;
+ int severity;
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ int order;
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE. If tolerant is cranked up, we'll try anyway.
+ */
+ int no_way_out = 0;
+ /*
+ * If kill_it gets set, there might be a way to recover from this
+ * error.
+ */
+ int kill_it = 0;
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ char *msg = "Unknown";
+
+ atomic_inc(&mce_entry);
+
+ __get_cpu_var(mce_exception_count)++;
+
+ if (notify_die(DIE_NMI, "machine check", regs, error_code,
+ 18, SIGKILL) == NOTIFY_STOP)
+ goto out;
+ if (!banks)
+ goto out;
+
+ mce_setup(&m);
+
+ m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ no_way_out = mce_no_way_out(&m, &msg);
+
+ final = &__get_cpu_var(mces_seen);
+ *final = m;
+
+ barrier();
+
+ /*
+ * When no restart IP must always kill or panic.
+ */
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_it = 1;
+
+ /*
+ * Go through all the banks in exclusion of the other CPUs.
+ * This way we don't report duplicated events on shared banks
+ * because the first one to see it will clear it.
+ */
+ order = mce_start(&no_way_out);
+ for (i = 0; i < banks; i++) {
+ __clear_bit(i, toclear);
+ if (!bank[i])
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+
+ m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if ((m.status & MCI_STATUS_VAL) == 0)
+ continue;
+
+ /*
+ * Non uncorrected or non signaled errors are handled by
+ * machine_check_poll. Leave them alone, unless this panics.
+ */
+ if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /*
+ * Set taint even when machine check was not enabled.
+ */
+ add_taint(TAINT_MACHINE_CHECK);
+
+ severity = mce_severity(&m, tolerant, NULL);
+
+ /*
+ * When machine check was for corrected handler don't touch,
+ * unless we're panicing.
+ */
+ if (severity == MCE_KEEP_SEVERITY && !no_way_out)
+ continue;
+ __set_bit(i, toclear);
+ if (severity == MCE_NO_SEVERITY) {
+ /*
+ * Machine check event was not enabled. Clear, but
+ * ignore.
+ */
+ continue;
+ }
+
+ /*
+ * Kill on action required.
+ */
+ if (severity == MCE_AR_SEVERITY)
+ kill_it = 1;
+
+ if (m.status & MCI_STATUS_MISCV)
+ m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
+ if (m.status & MCI_STATUS_ADDRV)
+ m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+
+ /*
+ * Action optional error. Queue address for later processing.
+ * When the ring overflows we just ignore the AO error.
+ * RED-PEN add some logging mechanism when
+ * usable_address or mce_add_ring fails.
+ * RED-PEN don't ignore overflow for tolerant == 0
+ */
+ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+ mce_ring_add(m.addr >> PAGE_SHIFT);
+
+ mce_get_rip(&m, regs);
+ mce_log(&m);
+
+ if (severity > worst) {
+ *final = m;
+ worst = severity;
+ }
+ }
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ /*
+ * If we have decided that we just CAN'T continue, and the user
+ * has not set tolerant to an insane level, give up and die.
+ *
+ * This is mainly used in the case when the system doesn't
+ * support MCE broadcasting or it has been disabled.
+ */
+ if (no_way_out && tolerant < 3)
+ mce_panic("Fatal machine check on current CPU", final, msg);
+
+ /*
+ * If the error seems to be unrecoverable, something should be
+ * done. Try to kill as little as possible. If we can kill just
+ * one task, do that. If the user has set the tolerance very
+ * high, don't try to do anything at all.
+ */
+
+ if (kill_it && tolerant < 3)
+ force_sig(SIGBUS, current);
+
+ /* notify userspace ASAP */
+ set_thread_flag(TIF_MCE_NOTIFY);
+
+ if (worst > 0)
+ mce_report_event(regs);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+out:
+ atomic_dec(&mce_entry);
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+/* dummy to break dependency. actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+ printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+ unsigned long pfn;
+ mce_notify_irq();
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+ mce_notify_process();
+}
+
+#ifdef CONFIG_X86_MCE_INTEL
+/***
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occurred.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(__u64 status)
+{
+ struct mce m;
+
+ mce_setup(&m);
+ m.bank = MCE_THERMAL_BANK;
+ m.status = status;
+ mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static int check_interval = 5 * 60; /* 5 minutes */
+
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static void mcheck_timer(unsigned long data)
+{
+ struct timer_list *t = &per_cpu(mce_timer, data);
+ int *n;
+
+ WARN_ON(smp_processor_id() != data);
+
+ if (mce_available(&current_cpu_data)) {
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
+ }
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the
+ * polling interval, otherwise increase the polling interval.
+ */
+ n = &__get_cpu_var(next_interval);
+ if (mce_notify_irq())
+ *n = max(*n/2, HZ/100);
+ else
+ *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+
+ t->expires = jiffies + *n;
+ add_timer_on(t, smp_processor_id());
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+int mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ wake_up_interruptible(&mce_wait);
+
+ /*
+ * There is no risk of missing notifications because
+ * work_pending is always cleared before the function is
+ * executed.
+ */
+ if (mce_helper[0] && !work_pending(&mce_trigger_work))
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
+ printk(KERN_INFO "Machine check events logged\n");
+
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mce_notify_irq);
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static int mce_cap_init(void)
+{
+ unsigned b;
+ u64 cap;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+
+ b = cap & MCG_BANKCNT_MASK;
+ printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+
+ if (b > MAX_NR_BANKS) {
+ printk(KERN_WARNING
+ "MCE: Using only %u machine check banks out of %u\n",
+ MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
+ }
+
+ /* Don't support asymmetric configurations today */
+ WARN_ON(banks != 0 && b != banks);
+ banks = b;
+ if (!bank) {
+ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
+ memset(bank, 0xff, banks * sizeof(u64));
+ }
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mce_ser = 1;
+
+ return 0;
+}
+
+static void mce_init(void)
+{
+ mce_banks_t all_banks;
+ u64 cap;
+ int i;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+
+ set_in_cr4(X86_CR4_MCE);
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+ for (i = 0; i < banks; i++) {
+ if (skip_bank_init(i))
+ continue;
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
+/* Add per CPU specific workarounds here */
+static int mce_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86 == 15 && banks > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&bank[4]);
+ }
+ if (c->x86 <= 17 && mce_bootlog < 0) {
+ /*
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
+ */
+ mce_bootlog = 0;
+ }
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && banks > 0)
+ bank[0] = 0;
+ }
+
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+
+ if (c->x86 == 6 && c->x86_model < 0x1A)
+ __set_bit(0, &dont_init_banks);
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+ monarch_timeout < 0)
+ monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
+ mce_bootlog = 0;
+ }
+ if (monarch_timeout < 0)
+ monarch_timeout = 0;
+ if (mce_bootlog != 0)
+ mce_panic_timeout = 30;
+
+ return 0;
+}
+
+static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 5)
+ return;
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ intel_p5_mcheck_init(c);
+ break;
+ case X86_VENDOR_CENTAUR:
+ winchip_mcheck_init(c);
+ break;
+ }
+}
+
+static void mce_cpu_features(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+ case X86_VENDOR_AMD:
+ mce_amd_feature_init(c);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mce_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ int *n = &__get_cpu_var(next_interval);
+
+ if (mce_ignore_ce)
+ return;
+
+ *n = check_interval * HZ;
+ if (!*n)
+ return;
+ setup_timer(t, mcheck_timer, smp_processor_id());
+ t->expires = round_jiffies(jiffies + *n);
+ add_timer_on(t, smp_processor_id());
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+{
+ if (mce_disabled)
+ return;
+
+ mce_ancient_init(c);
+
+ if (!mce_available(c))
+ return;
+
+ if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
+ mce_disabled = 1;
+ return;
+ }
+
+ machine_check_vector = do_machine_check;
+
+ mce_init();
+ mce_cpu_features(c);
+ mce_init_timer();
+ INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
+}
+
+/*
+ * Character device to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_state_lock);
+static int open_count; /* #times opened */
+static int open_exclu; /* already open exclusive? */
+
+static int mce_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_state_lock);
+
+ if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ open_exclu = 1;
+ open_count++;
+
+ spin_unlock(&mce_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_state_lock);
+
+ open_count--;
+ open_exclu = 0;
+
+ spin_unlock(&mce_state_lock);
+
+ return 0;
+}
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+
+ rdtscll(cpu_tsc[smp_processor_id()]);
+}
+
+static DEFINE_MUTEX(mce_read_mutex);
+
+static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
+ loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned long *cpu_tsc;
+ unsigned prev, next;
+ int i, err;
+
+ cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+ if (!cpu_tsc)
+ return -ENOMEM;
+
+ mutex_lock(&mce_read_mutex);
+ next = rcu_dereference(mcelog.next);
+
+ /* Only supports full reads right now */
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
+ mutex_unlock(&mce_read_mutex);
+ kfree(cpu_tsc);
+
+ return -EINVAL;
+ }
+
+ err = 0;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+
+ while (!mcelog.entry[i].finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(mcelog.entry + i, 0,
+ sizeof(struct mce));
+ goto timeout;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i,
+ sizeof(struct mce));
+ buf += sizeof(struct mce);
+timeout:
+ ;
+ }
+
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+
+ synchronize_sched();
+
+ /*
+ * Collect entries that were still getting written before the
+ * synchronize.
+ */
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ if (mcelog.entry[i].finished &&
+ mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
+ err |= copy_to_user(buf, mcelog.entry+i,
+ sizeof(struct mce));
+ smp_rmb();
+ buf += sizeof(struct mce);
+ memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ }
+ }
+ mutex_unlock(&mce_read_mutex);
+ kfree(cpu_tsc);
+
+ return err ? -EFAULT : buf - ubuf;
+}
+
+static unsigned int mce_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_wait, wait);
+ if (rcu_dereference(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+/* Modified in mce-inject.c, so not static or const */
+struct file_operations mce_chrdev_ops = {
+ .open = mce_open,
+ .release = mce_release,
+ .read = mce_read,
+ .poll = mce_poll,
+ .unlocked_ioctl = mce_ioctl,
+};
+EXPORT_SYMBOL_GPL(mce_chrdev_ops);
+
+static struct miscdevice mce_log_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ * monarchtimeout is how long to wait for other CPUs on machine
+ * check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ * mce=nobootlog Don't log MCEs from before booting.
+ */
+static int __init mcheck_enable(char *str)
+{
+ if (*str == 0)
+ enable_p5_mce();
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ mce_disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ mce_cmci_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ mce_dont_log_ce = 1;
+ else if (!strcmp(str, "ignore_ce"))
+ mce_ignore_ce = 1;
+ else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+ mce_bootlog = (str[0] == 'b');
+ else if (isdigit(str[0])) {
+ get_option(&str, &tolerant);
+ if (*str == ',') {
+ ++str;
+ get_option(&str, &monarch_timeout);
+ }
+ } else {
+ printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
+ str);
+ return 0;
+ }
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+/*
+ * Sysfs support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ if (!skip_bank_init(i))
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ }
+ return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+ return mce_disable();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static int mce_resume(struct sys_device *dev)
+{
+ mce_init();
+ mce_cpu_features(&current_cpu_data);
+
+ return 0;
+}
+
+static void mce_cpu_restart(void *data)
+{
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ if (!mce_available(&current_cpu_data))
+ return;
+ mce_init();
+ mce_init_timer();
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_ce(void *all)
+{
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (all)
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+ if (!mce_available(&current_cpu_data))
+ return;
+ cmci_reenable();
+ cmci_recheck();
+ if (all)
+ mce_init_timer();
+}
+
+static struct sysdev_class mce_sysclass = {
+ .suspend = mce_suspend,
+ .shutdown = mce_shutdown,
+ .resume = mce_resume,
+ .name = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct sys_device, mce_dev);
+
+__cpuinitdata
+void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ char *buf)
+{
+ u64 b = bank[attr - bank_attrs];
+
+ return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (strict_strtoull(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ bank[attr - bank_attrs] = new;
+ mce_restart();
+
+ return size;
+}
+
+static ssize_t
+show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strncpy(mce_helper, buf, sizeof(mce_helper));
+ mce_helper[sizeof(mce_helper)-1] = 0;
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+static ssize_t set_ignore_ce(struct sys_device *s,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (strict_strtoull(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (mce_ignore_ce ^ !!new) {
+ if (new) {
+ /* disable ce features */
+ on_each_cpu(mce_disable_ce, (void *)1, 1);
+ mce_ignore_ce = 1;
+ } else {
+ /* enable ce features */
+ mce_ignore_ce = 0;
+ on_each_cpu(mce_enable_ce, (void *)1, 1);
+ }
+ }
+ return size;
+}
+
+static ssize_t set_cmci_disabled(struct sys_device *s,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (strict_strtoull(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (mce_cmci_disabled ^ !!new) {
+ if (new) {
+ /* disable cmci */
+ on_each_cpu(mce_disable_ce, NULL, 1);
+ mce_cmci_disabled = 1;
+ } else {
+ /* enable cmci */
+ mce_cmci_disabled = 0;
+ on_each_cpu(mce_enable_ce, NULL, 1);
+ }
+ }
+ return size;
+}
+
+static ssize_t store_int_with_restart(struct sys_device *s,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t size)
+{
+ ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ mce_restart();
+ return ret;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
+static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+
+static struct sysdev_ext_attribute attr_check_interval = {
+ _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
+ store_int_with_restart),
+ &check_interval
+};
+
+static struct sysdev_ext_attribute attr_ignore_ce = {
+ _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+ &mce_ignore_ce
+};
+
+static struct sysdev_ext_attribute attr_cmci_disabled = {
+ _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+ &mce_cmci_disabled
+};
+
+static struct sysdev_attribute *mce_attrs[] = {
+ &attr_tolerant.attr,
+ &attr_check_interval.attr,
+ &attr_trigger,
+ &attr_monarch_timeout.attr,
+ &attr_dont_log_ce.attr,
+ &attr_ignore_ce.attr,
+ &attr_cmci_disabled.attr,
+ NULL
+};
+
+static cpumask_var_t mce_dev_initialized;
+
+/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_create_device(unsigned int cpu)
+{
+ int err;
+ int i, j;
+
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+
+ memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
+ per_cpu(mce_dev, cpu).id = cpu;
+ per_cpu(mce_dev, cpu).cls = &mce_sysclass;
+
+ err = sysdev_register(&per_cpu(mce_dev, cpu));
+ if (err)
+ return err;
+
+ for (i = 0; mce_attrs[i]; i++) {
+ err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (j = 0; j < banks; j++) {
+ err = sysdev_create_file(&per_cpu(mce_dev, cpu),
+ &bank_attrs[j]);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_dev_initialized);
+
+ return 0;
+error2:
+ while (--j >= 0)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
+error:
+ while (--i >= 0)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+ sysdev_unregister(&per_cpu(mce_dev, cpu));
+
+ return err;
+}
+
+static __cpuinit void mce_remove_device(unsigned int cpu)
+{
+ int i;
+
+ if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+ return;
+
+ for (i = 0; mce_attrs[i]; i++)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+ for (i = 0; i < banks; i++)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
+
+ sysdev_unregister(&per_cpu(mce_dev, cpu));
+ cpumask_clear_cpu(cpu, mce_dev_initialized);
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+ unsigned long action = *(unsigned long *)h;
+ int i;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_clear();
+ for (i = 0; i < banks; i++) {
+ if (!skip_bank_init(i))
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ }
+}
+
+static void mce_reenable_cpu(void *h)
+{
+ unsigned long action = *(unsigned long *)h;
+ int i;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_reenable();
+ for (i = 0; i < banks; i++) {
+ if (!skip_bank_init(i))
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+ }
+}
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static int __cpuinit
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ mce_create_device(cpu);
+ if (threshold_cpu_callback)
+ threshold_cpu_callback(action, cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ if (threshold_cpu_callback)
+ threshold_cpu_callback(action, cpu);
+ mce_remove_device(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ del_timer_sync(t);
+ smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ t->expires = round_jiffies(jiffies +
+ __get_cpu_var(next_interval));
+ add_timer_on(t, cpu);
+ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ break;
+ case CPU_POST_DEAD:
+ /* intentionally ignoring frozen here */
+ cmci_rediscover(cpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_cpu_notifier __cpuinitdata = {
+ .notifier_call = mce_cpu_callback,
+};
+
+static __init int mce_init_banks(void)
+{
+ int i;
+
+ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+ GFP_KERNEL);
+ if (!bank_attrs)
+ return -ENOMEM;
+
+ for (i = 0; i < banks; i++) {
+ struct sysdev_attribute *a = &bank_attrs[i];
+
+ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+ if (!a->attr.name)
+ goto nomem;
+
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(bank_attrs[i].attr.name);
+ kfree(bank_attrs);
+ bank_attrs = NULL;
+
+ return -ENOMEM;
+}
+
+static __init int mce_init_device(void)
+{
+ int err;
+ int i = 0;
+
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+
+ zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+
+ err = mce_init_banks();
+ if (err)
+ return err;
+
+ err = sysdev_class_register(&mce_sysclass);
+ if (err)
+ return err;
+
+ for_each_online_cpu(i) {
+ err = mce_create_device(i);
+ if (err)
+ return err;
+ }
+
+ register_hotcpu_notifier(&mce_cpu_notifier);
+ misc_register(&mce_log_device);
+
+ return err;
+}
+
+device_initcall(mce_init_device);
+
+#else /* CONFIG_X86_OLD_MCE: */
+
+int nr_mce_banks;
+EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
+
+/* This has to be run for each processor */
+void mcheck_init(struct cpuinfo_x86 *c)
+{
+ if (mce_disabled)
+ return;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ amd_mcheck_init(c);
+ break;
+
+ case X86_VENDOR_INTEL:
+ if (c->x86 == 5)
+ intel_p5_mcheck_init(c);
+ if (c->x86 == 6)
+ intel_p6_mcheck_init(c);
+ if (c->x86 == 15)
+ intel_p4_mcheck_init(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ if (c->x86 == 5)
+ winchip_mcheck_init(c);
+ break;
+
+ default:
+ break;
+ }
+ printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
+}
+
+static int __init mcheck_enable(char *str)
+{
+ mce_p5_enabled = 1;
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+#endif /* CONFIG_X86_OLD_MCE */
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mce_disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
deleted file mode 100644
index ae9f628838f..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <linux/init.h>
-#include <asm/mce.h>
-
-void amd_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-
-/* Call the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *, long error_code);
-
-extern int nr_mce_banks;
-
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/thread_info.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-
-#include "mce.h"
-
-int mce_disabled;
-int nr_mce_banks;
-
-EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
-
-/* This has to be run for each processor */
-void mcheck_init(struct cpuinfo_x86 *c)
-{
- if (mce_disabled == 1)
- return;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- amd_mcheck_init(c);
- break;
-
- case X86_VENDOR_INTEL:
- if (c->x86 == 5)
- intel_p5_mcheck_init(c);
- if (c->x86 == 6)
- intel_p6_mcheck_init(c);
- if (c->x86 == 15)
- intel_p4_mcheck_init(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- if (c->x86 == 5)
- winchip_mcheck_init(c);
- break;
-
- default:
- break;
- }
-}
-
-static int __init mcheck_disable(char *str)
-{
- mce_disabled = 1;
- return 1;
-}
-
-static int __init mcheck_enable(char *str)
-{
- mce_disabled = -1;
- return 1;
-}
-
-__setup("nomce", mcheck_disable);
-__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 09dd1d414fc..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
-/*
- * Machine check handler.
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/string.h>
-#include <linux/rcupdate.h>
-#include <linux/kallsyms.h>
-#include <linux/sysdev.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/percpu.h>
-#include <linux/poll.h>
-#include <linux/thread_info.h>
-#include <linux/ctype.h>
-#include <linux/kmod.h>
-#include <linux/kdebug.h>
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/ratelimit.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/idle.h>
-
-#define MISC_MCELOG_MINOR 227
-
-atomic_t mce_entry;
-
-static int mce_dont_init;
-
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant = 1;
-static int banks;
-static u64 *bank;
-static unsigned long notify_user;
-static int rip_msr;
-static int mce_bootlog = -1;
-static atomic_t mce_events;
-
-static char trigger[128];
-static char *trigger_argv[2] = { trigger, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
-
-/* MCA banks polled by the period polling timer for corrected events */
-DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
- [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
-};
-
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
-{
- memset(m, 0, sizeof(struct mce));
- m->cpu = smp_processor_id();
- rdtscll(m->tsc);
-}
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- MCE_LOG_SIGNATURE,
- MCE_LOG_LEN,
-};
-
-void mce_log(struct mce *mce)
-{
- unsigned next, entry;
- atomic_inc(&mce_events);
- mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference(mcelog.next);
- for (;;) {
- /* When the buffer fills up discard new entries. Assume
- that the earlier errors are the more interesting. */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip. */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- set_bit(0, &notify_user);
-}
-
-static void print_mce(struct mce *m)
-{
- printk(KERN_EMERG "\n"
- KERN_EMERG "HARDWARE ERROR\n"
- KERN_EMERG
- "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
- m->cpu, m->mcgstatus, m->bank, m->status);
- if (m->ip) {
- printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
- if (m->cs == __KERNEL_CS)
- print_symbol("{%s}", m->ip);
- printk("\n");
- }
- printk(KERN_EMERG "TSC %llx ", m->tsc);
- if (m->addr)
- printk("ADDR %llx ", m->addr);
- if (m->misc)
- printk("MISC %llx ", m->misc);
- printk("\n");
- printk(KERN_EMERG "This is not a software problem!\n");
- printk(KERN_EMERG "Run through mcelog --ascii to decode "
- "and contact your hardware vendor\n");
-}
-
-static void mce_panic(char *msg, struct mce *backup, unsigned long start)
-{
- int i;
-
- oops_begin();
- for (i = 0; i < MCE_LOG_LEN; i++) {
- unsigned long tsc = mcelog.entry[i].tsc;
-
- if (time_before(tsc, start))
- continue;
- print_mce(&mcelog.entry[i]);
- if (backup && mcelog.entry[i].tsc == backup->tsc)
- backup = NULL;
- }
- if (backup)
- print_mce(backup);
- panic(msg);
-}
-
-int mce_available(struct cpuinfo_x86 *c)
-{
- if (mce_dont_init)
- return 0;
- return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
-}
-
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
- if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr) {
- /* Assume the RIP in the MSR is exact. Is this true? */
- m->mcgstatus |= MCG_STATUS_EIPV;
- rdmsrl(rip_msr, m->ip);
- m->cs = 0;
- }
-}
-
-/*
- * Poll for corrected events or events that happened before reset.
- * Those are just logged through /dev/mcelog.
- *
- * This is executed in standard interrupt context.
- */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
-{
- struct mce m;
- int i;
-
- mce_setup(&m);
-
- rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
- for (i = 0; i < banks; i++) {
- if (!bank[i] || !test_bit(i, *b))
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
- m.tsc = 0;
-
- barrier();
- rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
- if (!(m.status & MCI_STATUS_VAL))
- continue;
-
- /*
- * Uncorrected events are handled by the exception handler
- * when it is enabled. But when the exception is disabled log
- * everything.
- *
- * TBD do the same check for MCI_STATUS_EN here?
- */
- if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
- continue;
-
- if (m.status & MCI_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
- if (m.status & MCI_STATUS_ADDRV)
- rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
-
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
- /*
- * Don't get the IP here because it's unlikely to
- * have anything to do with the actual error location.
- */
- if (!(flags & MCP_DONTLOG)) {
- mce_log(&m);
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- /*
- * Clear state for this bank.
- */
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-
- /*
- * Don't clear MCG_STATUS here because it's only defined for
- * exceptions.
- */
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- */
-void do_machine_check(struct pt_regs * regs, long error_code)
-{
- struct mce m, panicm;
- u64 mcestart = 0;
- int i;
- int panicm_found = 0;
- /*
- * If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
- */
- int no_way_out = 0;
- /*
- * If kill_it gets set, there might be a way to recover from this
- * error.
- */
- int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
-
- atomic_inc(&mce_entry);
-
- if (notify_die(DIE_NMI, "machine check", regs, error_code,
- 18, SIGKILL) == NOTIFY_STOP)
- goto out2;
- if (!banks)
- goto out2;
-
- mce_setup(&m);
-
- rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
- /* if the restart IP is not valid, we're done for */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- no_way_out = 1;
-
- rdtscll(mcestart);
- barrier();
-
- for (i = 0; i < banks; i++) {
- __clear_bit(i, toclear);
- if (!bank[i])
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
- * Non uncorrected errors are handled by machine_check_poll
- * Leave them alone.
- */
- if ((m.status & MCI_STATUS_UC) == 0)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK);
-
- __set_bit(i, toclear);
-
- if (m.status & MCI_STATUS_EN) {
- /* if PCC was set, there's no way out */
- no_way_out |= !!(m.status & MCI_STATUS_PCC);
- /*
- * If this error was uncorrectable and there was
- * an overflow, we're in trouble. If no overflow,
- * we might get away with just killing a task.
- */
- if (m.status & MCI_STATUS_UC) {
- if (tolerant < 1 || m.status & MCI_STATUS_OVER)
- no_way_out = 1;
- kill_it = 1;
- }
- } else {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- if (m.status & MCI_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
- if (m.status & MCI_STATUS_ADDRV)
- rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
-
- mce_get_rip(&m, regs);
- mce_log(&m);
-
- /* Did this bank cause the exception? */
- /* Assume that the bank with uncorrectable errors did it,
- and that there is only a single one. */
- if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
- panicm = m;
- panicm_found = 1;
- }
- }
-
- /* If we didn't find an uncorrectable error, pick
- the last one (shouldn't happen, just being safe). */
- if (!panicm_found)
- panicm = m;
-
- /*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- */
- if (no_way_out && tolerant < 3)
- mce_panic("Machine check", &panicm, mcestart);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
- if (kill_it && tolerant < 3) {
- int user_space = 0;
-
- /*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
- */
- if (m.mcgstatus & MCG_STATUS_EIPV)
- user_space = panicm.ip && (panicm.cs & 3);
-
- /*
- * If we know that the error was in user space, send a
- * SIGBUS. Otherwise, panic if tolerance is low.
- *
- * force_sig() takes an awful lot of locks and has a slight
- * risk of deadlocking.
- */
- if (user_space) {
- force_sig(SIGBUS, current);
- } else if (panic_on_oops || tolerant < 2) {
- mce_panic("Uncorrected machine check",
- &panicm, mcestart);
- }
- }
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
-
- /* the last thing we do is clear state */
- for (i = 0; i < banks; i++) {
- if (test_bit(i, toclear))
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
- wrmsrl(MSR_IA32_MCG_STATUS, 0);
- out2:
- atomic_dec(&mce_entry);
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
- struct mce m;
-
- mce_setup(&m);
- m.bank = MCE_THERMAL_BANK;
- m.status = status;
- mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll 2x faster. When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-
-static int check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
-static void mcheck_timer(unsigned long);
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
-
-static void mcheck_timer(unsigned long data)
-{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
-
- WARN_ON(smp_processor_id() != data);
-
- if (mce_available(&current_cpu_data))
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
-
- /*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
- */
- n = &__get_cpu_var(next_interval);
- if (mce_notify_user()) {
- *n = max(*n/2, HZ/100);
- } else {
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
- }
-
- t->expires = jiffies + *n;
- add_timer(t);
-}
-
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_user(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- clear_thread_flag(TIF_MCE_NOTIFY);
- if (test_and_clear_bit(0, &notify_user)) {
- wake_up_interruptible(&mce_wait);
-
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (trigger[0] && !work_pending(&mce_trigger_work))
- schedule_work(&mce_trigger_work);
-
- if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
-
- return 1;
- }
- return 0;
-}
-
-/* see if the idle task needs to notify userspace */
-static int
-mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
-{
- /* IDLE_END should be safe - interrupts are back on */
- if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
- mce_notify_user();
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_idle_notifier = {
- .notifier_call = mce_idle_callback,
-};
-
-static __init int periodic_mcheck_init(void)
-{
- idle_notifier_register(&mce_idle_notifier);
- return 0;
-}
-__initcall(periodic_mcheck_init);
-
-/*
- * Initialize Machine Checks for a CPU.
- */
-static int mce_cap_init(void)
-{
- u64 cap;
- unsigned b;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- b = cap & 0xff;
- if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
- MAX_NR_BANKS, b);
- b = MAX_NR_BANKS;
- }
-
- /* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
- if (!bank) {
- bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
- if (!bank)
- return -ENOMEM;
- memset(bank, 0xff, banks * sizeof(u64));
- }
-
- /* Use accurate RIP reporting if available. */
- if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
-
- return 0;
-}
-
-static void mce_init(void *dummy)
-{
- u64 cap;
- int i;
- mce_banks_t all_banks;
-
- /*
- * Log the machine checks left over from the previous reset.
- */
- bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
-
- set_in_cr4(X86_CR4_MCE);
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- if (cap & MCG_CTL_P)
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < banks; i++) {
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-}
-
-/* Add per CPU specific workarounds here */
-static void mce_cpu_quirks(struct cpuinfo_x86 *c)
-{
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4)
- /* disable GART TBL walk error reporting, which trips off
- incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, (unsigned long *)&bank[4]);
- if(c->x86 <= 17 && mce_bootlog < 0)
- /* Lots of broken BIOS around that don't clear them
- by default and leave crap in there. Don't log. */
- mce_bootlog = 0;
- }
-
-}
-
-static void mce_cpu_features(struct cpuinfo_x86 *c)
-{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- break;
- case X86_VENDOR_AMD:
- mce_amd_feature_init(c);
- break;
- default:
- break;
- }
-}
-
-static void mce_init_timer(void)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
-
- *n = check_interval * HZ;
- if (!*n)
- return;
- setup_timer(t, mcheck_timer, smp_processor_id());
- t->expires = round_jiffies(jiffies + *n);
- add_timer(t);
-}
-
-/*
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off.
- */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
-{
- if (!mce_available(c))
- return;
-
- if (mce_cap_init() < 0) {
- mce_dont_init = 1;
- return;
- }
- mce_cpu_quirks(c);
-
- mce_init(NULL);
- mce_cpu_features(c);
- mce_init_timer();
-}
-
-/*
- * Character device to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
-
-static int mce_open(struct inode *inode, struct file *file)
-{
- lock_kernel();
- spin_lock(&mce_state_lock);
-
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
- unlock_kernel();
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
-
- spin_unlock(&mce_state_lock);
- unlock_kernel();
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_state_lock);
-
- open_count--;
- open_exclu = 0;
-
- spin_unlock(&mce_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
-{
- unsigned long *cpu_tsc;
- static DEFINE_MUTEX(mce_read_mutex);
- unsigned prev, next;
- char __user *buf = ubuf;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
-
- /* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
- return -EINVAL;
- }
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
- }
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
- return err ? -EFAULT : buf - ubuf;
-}
-
-static unsigned int mce_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
-};
-
-static struct miscdevice mce_log_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
- mce_dont_init = 1;
- return 1;
-}
-
-/* mce=off disables machine check.
- mce=TOLERANCELEVEL (number, see above)
- mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
- mce=nobootlog Don't log MCEs from before booting. */
-static int __init mcheck_enable(char *str)
-{
- if (!strcmp(str, "off"))
- mce_dont_init = 1;
- else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
- mce_bootlog = str[0] == 'b';
- else if (isdigit(str[0]))
- get_option(&str, &tolerant);
- else
- printk("mce= argument %s ignored. Please use /sys", str);
- return 1;
-}
-
-__setup("nomce", mcheck_disable);
-__setup("mce=", mcheck_enable);
-
-/*
- * Sysfs support
- */
-
-/*
- * Disable machine checks on suspend and shutdown. We can't really handle
- * them later.
- */
-static int mce_disable(void)
-{
- int i;
-
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
- return 0;
-}
-
-static int mce_suspend(struct sys_device *dev, pm_message_t state)
-{
- return mce_disable();
-}
-
-static int mce_shutdown(struct sys_device *dev)
-{
- return mce_disable();
-}
-
-/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
- Only one CPU is active at this time, the others get readded later using
- CPU hotplug. */
-static int mce_resume(struct sys_device *dev)
-{
- mce_init(NULL);
- mce_cpu_features(&current_cpu_data);
- return 0;
-}
-
-static void mce_cpu_restart(void *data)
-{
- del_timer_sync(&__get_cpu_var(mce_timer));
- if (mce_available(&current_cpu_data))
- mce_init(NULL);
- mce_init_timer();
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
- on_each_cpu(mce_cpu_restart, NULL, 1);
-}
-
-static struct sysdev_class mce_sysclass = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
- .name = "machinecheck",
-};
-
-DEFINE_PER_CPU(struct sys_device, device_mce);
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
-
-/* Why are there no generic functions for this? */
-#define ACCESSOR(name, var, start) \
- static ssize_t show_ ## name(struct sys_device *s, \
- struct sysdev_attribute *attr, \
- char *buf) { \
- return sprintf(buf, "%lx\n", (unsigned long)var); \
- } \
- static ssize_t set_ ## name(struct sys_device *s, \
- struct sysdev_attribute *attr, \
- const char *buf, size_t siz) { \
- char *end; \
- unsigned long new = simple_strtoul(buf, &end, 0); \
- if (end == buf) return -EINVAL; \
- var = new; \
- start; \
- return end-buf; \
- } \
- static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-
-static struct sysdev_attribute *bank_attrs;
-
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
- char *buf)
-{
- u64 b = bank[attr - bank_attrs];
- return sprintf(buf, "%llx\n", b);
-}
-
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf, size_t siz)
-{
- char *end;
- u64 new = simple_strtoull(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
- bank[attr - bank_attrs] = new;
- mce_restart();
- return end-buf;
-}
-
-static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
- char *buf)
-{
- strcpy(buf, trigger);
- strcat(buf, "\n");
- return strlen(trigger) + 1;
-}
-
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf,size_t siz)
-{
- char *p;
- int len;
- strncpy(trigger, buf, sizeof(trigger));
- trigger[sizeof(trigger)-1] = 0;
- len = strlen(trigger);
- p = strchr(trigger, '\n');
- if (*p) *p = 0;
- return len;
-}
-
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-ACCESSOR(check_interval,check_interval,mce_restart())
-static struct sysdev_attribute *mce_attributes[] = {
- &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
- NULL
-};
-
-static cpumask_var_t mce_device_initialized;
-
-/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
-static __cpuinit int mce_create_device(unsigned int cpu)
-{
- int err;
- int i;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(device_mce,cpu).id = cpu;
- per_cpu(device_mce,cpu).cls = &mce_sysclass;
-
- err = sysdev_register(&per_cpu(device_mce,cpu));
- if (err)
- return err;
-
- for (i = 0; mce_attributes[i]; i++) {
- err = sysdev_create_file(&per_cpu(device_mce,cpu),
- mce_attributes[i]);
- if (err)
- goto error;
- }
- for (i = 0; i < banks; i++) {
- err = sysdev_create_file(&per_cpu(device_mce, cpu),
- &bank_attrs[i]);
- if (err)
- goto error2;
- }
- cpumask_set_cpu(cpu, mce_device_initialized);
-
- return 0;
-error2:
- while (--i >= 0) {
- sysdev_remove_file(&per_cpu(device_mce, cpu),
- &bank_attrs[i]);
- }
-error:
- while (--i >= 0) {
- sysdev_remove_file(&per_cpu(device_mce,cpu),
- mce_attributes[i]);
- }
- sysdev_unregister(&per_cpu(device_mce,cpu));
-
- return err;
-}
-
-static __cpuinit void mce_remove_device(unsigned int cpu)
-{
- int i;
-
- if (!cpumask_test_cpu(cpu, mce_device_initialized))
- return;
-
- for (i = 0; mce_attributes[i]; i++)
- sysdev_remove_file(&per_cpu(device_mce,cpu),
- mce_attributes[i]);
- for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(device_mce, cpu),
- &bank_attrs[i]);
- sysdev_unregister(&per_cpu(device_mce,cpu));
- cpumask_clear_cpu(cpu, mce_device_initialized);
-}
-
-/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
-{
- int i;
- unsigned long action = *(unsigned long *)h;
-
- if (!mce_available(&current_cpu_data))
- return;
- if (!(action & CPU_TASKS_FROZEN))
- cmci_clear();
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
-}
-
-static void mce_reenable_cpu(void *h)
-{
- int i;
- unsigned long action = *(unsigned long *)h;
-
- if (!mce_available(&current_cpu_data))
- return;
- if (!(action & CPU_TASKS_FROZEN))
- cmci_reenable();
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct timer_list *t = &per_cpu(mce_timer, cpu);
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
- add_timer_on(t, cpu);
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- break;
- case CPU_POST_DEAD:
- /* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
- .notifier_call = mce_cpu_callback,
-};
-
-static __init int mce_init_banks(void)
-{
- int i;
-
- bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
- GFP_KERNEL);
- if (!bank_attrs)
- return -ENOMEM;
-
- for (i = 0; i < banks; i++) {
- struct sysdev_attribute *a = &bank_attrs[i];
- a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
- if (!a->attr.name)
- goto nomem;
- a->attr.mode = 0644;
- a->show = show_bank;
- a->store = set_bank;
- }
- return 0;
-
-nomem:
- while (--i >= 0)
- kfree(bank_attrs[i].attr.name);
- kfree(bank_attrs);
- bank_attrs = NULL;
- return -ENOMEM;
-}
-
-static __init int mce_init_device(void)
-{
- int err;
- int i = 0;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
-
- err = mce_init_banks();
- if (err)
- return err;
-
- err = sysdev_class_register(&mce_sysclass);
- if (err)
- return err;
-
- for_each_online_cpu(i) {
- err = mce_create_device(i);
- if (err)
- return err;
- }
-
- register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
- return err;
-}
-
-device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 56dde9c4bc9..ddae21620bd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -13,22 +13,22 @@
*
* All MC4_MISCi registers are shared between multi-cores
*/
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
-#include <linux/kobject.h>
#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
#include <linux/sysdev.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+
#include <asm/apic.h>
+#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/percpu.h>
-#include <asm/idle.h>
#define PFX "mce_threshold: "
#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
#define MCG_XBLK_ADDR 0xC0000400
struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
+ unsigned int block;
+ unsigned int bank;
+ unsigned int cpu;
+ u32 address;
+ u16 interrupt_enable;
+ u16 threshold_limit;
+ struct kobject kobj;
+ struct list_head miscj;
};
/* defaults used early on boot */
static struct threshold_block threshold_defaults = {
- .interrupt_enable = 0,
- .threshold_limit = THRESHOLD_MAX,
+ .interrupt_enable = 0,
+ .threshold_limit = THRESHOLD_MAX,
};
struct threshold_bank {
- struct kobject *kobj;
- struct threshold_block *blocks;
- cpumask_var_t cpus;
+ struct kobject *kobj;
+ struct threshold_block *blocks;
+ cpumask_var_t cpus;
};
static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
*/
struct thresh_restart {
- struct threshold_block *b;
- int reset;
- u16 old_limit;
+ struct threshold_block *b;
+ int reset;
+ u16 old_limit;
};
/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (mci_misc_hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
+
mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
- unsigned int bank, block;
unsigned int cpu = smp_processor_id();
- u8 lvt_off;
u32 low = 0, high = 0, address = 0;
+ unsigned int bank, block;
struct thresh_restart tr;
+ u8 lvt_off;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!address)
break;
address += MCG_XBLK_ADDR;
- }
- else
+ } else
++address;
if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
*/
static void amd_threshold_interrupt(void)
{
+ u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct mce m;
- u32 low = 0, high = 0, address = 0;
mce_setup(&m);
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
+ if (block == 0) {
address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1) {
+ } else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
- }
- else
+ } else {
++address;
+ }
if (rdmsr_safe(address, &low, &high))
break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
(high & MASK_LOCKED_HI))
continue;
- /* Log the machine check that caused the threshold
- event. */
+ /*
+ * Log the machine check that caused the threshold
+ * event.
+ */
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
struct threshold_attr {
struct attribute attr;
- ssize_t(*show) (struct threshold_block *, char *);
- ssize_t(*store) (struct threshold_block *, const char *, size_t count);
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
};
-#define SHOW_FIELDS(name) \
-static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
-{ \
- return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lx\n", (unsigned long) b->name); \
}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
-static ssize_t store_interrupt_enable(struct threshold_block *b,
- const char *buf, size_t count)
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
{
- char *end;
struct thresh_restart tr;
- unsigned long new = simple_strtoul(buf, &end, 0);
- if (end == buf)
+ unsigned long new;
+
+ if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
+
b->interrupt_enable = !!new;
- tr.b = b;
- tr.reset = 0;
- tr.old_limit = 0;
+ tr.b = b;
+ tr.reset = 0;
+ tr.old_limit = 0;
+
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return end - buf;
+ return size;
}
-static ssize_t store_threshold_limit(struct threshold_block *b,
- const char *buf, size_t count)
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
{
- char *end;
struct thresh_restart tr;
- unsigned long new = simple_strtoul(buf, &end, 0);
- if (end == buf)
+ unsigned long new;
+
+ if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
+
if (new > THRESHOLD_MAX)
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
+
tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return end - buf;
+ return size;
}
struct threshold_block_cross_cpu {
- struct threshold_block *tb;
- long retval;
+ struct threshold_block *tb;
+ long retval;
};
static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
return 1;
}
-#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
};
-#define RW_ATTR(name) \
-static struct threshold_attr name = \
- THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
-
RW_ATTR(interrupt_enable);
RW_ATTR(threshold_limit);
RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
NULL
};
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
+
ret = a->show ? a->show(b, buf) : -EIO;
+
return ret;
}
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
+
ret = a->store ? a->store(b, buf, count) : -EIO;
+
return ret;
}
static struct sysfs_ops threshold_ops = {
- .show = show,
- .store = store,
+ .show = show,
+ .store = store,
};
static struct kobj_type threshold_ktype = {
- .sysfs_ops = &threshold_ops,
- .default_attrs = default_attrs,
+ .sysfs_ops = &threshold_ops,
+ .default_attrs = default_attrs,
};
static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
unsigned int block,
u32 address)
{
- int err;
- u32 low, high;
struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
if (!b)
return -ENOMEM;
- b->block = block;
- b->bank = bank;
- b->cpu = cpu;
- b->address = address;
- b->interrupt_enable = 0;
- b->threshold_limit = THRESHOLD_MAX;
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->threshold_limit = THRESHOLD_MAX;
INIT_LIST_HEAD(&b->miscj);
- if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+ if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
list_add(&b->miscj,
&per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- else
+ } else {
per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+ }
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
if (!address)
return 0;
address += MCG_XBLK_ADDR;
- } else
+ } else {
++address;
+ }
err = allocate_threshold_blocks(cpu, bank, ++block, address);
if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
+ err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
b->kobj, name);
if (err)
goto out;
cpumask_copy(b->cpus, cpu_core_mask(cpu));
per_cpu(threshold_banks, cpu)[bank] = b;
+
goto out;
}
#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
if (!b->kobj)
goto out_free;
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+ err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
b->kobj, name);
if (err)
goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
static void threshold_remove_bank(unsigned int cpu, int bank)
{
- int i = 0;
struct threshold_bank *b;
char name[32];
+ int i = 0;
b = per_cpu(threshold_banks, cpu)[bank];
-
if (!b)
return;
-
if (!b->blocks)
goto free_out;
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
+
return;
}
#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
}
/* get notified when a cpu comes on/off */
-static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
- unsigned int cpu)
+static void __cpuinit
+amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
{
- if (cpu >= NR_CPUS)
- return;
-
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
int err = threshold_create_device(lcpu);
+
if (err)
return err;
}
threshold_cpu_callback = amd_64_threshold_cpu_callback;
+
return 0;
}
-
device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index cef3ee30744..e1acec0f7a3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,85 +8,10 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
-#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
-#include <asm/hw_irq.h>
-#include <asm/idle.h>
-#include <asm/therm_throt.h>
-#include <asm/apic.h>
-
-asmlinkage void smp_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- ack_APIC_irq();
-
- exit_idle();
- irq_enter();
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & 1))
- mce_log_therm_throt_event(msr_val);
-
- inc_irq_stat(irq_thermal_count);
- irq_exit();
-}
-
-static void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int tm2 = 0;
- unsigned int cpu = smp_processor_id();
-
- if (!cpu_has(c, X86_FEATURE_ACPI))
- return;
-
- if (!cpu_has(c, X86_FEATURE_ACC))
- return;
-
- /* first check if TM1 is already enabled by the BIOS, in which
- * case there might be some SMM goo which handles it, so we can't even
- * put a handler since it might be delivered via SMI already.
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG
- "CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
- tm2 = 1;
-
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already "
- "installed\n", cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
- h = THERMAL_APIC_VECTOR;
- h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
- cpu, tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
- return;
-}
/*
* Support for Intel Correct Machine Check Interrupts. This allows
@@ -109,6 +34,9 @@ static int cmci_supported(int *banks)
{
u64 cap;
+ if (mce_cmci_disabled || mce_ignore_ce)
+ return 0;
+
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
@@ -132,7 +60,7 @@ static int cmci_supported(int *banks)
static void intel_threshold_interrupt(void)
{
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- mce_notify_user();
+ mce_notify_irq();
}
static void print_update(char *type, int *hdr, int num)
@@ -248,7 +176,7 @@ void cmci_rediscover(int dying)
return;
cpumask_copy(old, &current->cpus_allowed);
- for_each_online_cpu (cpu) {
+ for_each_online_cpu(cpu) {
if (cpu == dying)
continue;
if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc..f5f2d6f71fb 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,25 +6,23 @@
* This file contains routines to check for non-fatal MCEs every 15s
*
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/workqueue.h>
#include <linux/interrupt.h>
-#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/mce.h>
#include <asm/msr.h>
-#include "mce.h"
+static int firstbank;
-static int firstbank;
-
-#define MCE_RATE 15*HZ /* timer rate is 15s */
+#define MCE_RATE (15*HZ) /* timer rate is 15s */
static void mce_checkregs(void *info)
{
@@ -34,23 +32,24 @@ static void mce_checkregs(void *info)
for (i = firstbank; i < nr_mce_banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- printk(KERN_INFO "MCE: The hardware reports a non "
- "fatal, correctable incident occurred on "
- "CPU %d.\n",
+ if (!(high & (1<<31)))
+ continue;
+
+ printk(KERN_INFO "MCE: The hardware reports a non fatal, "
+ "correctable incident occurred on CPU %d.\n",
smp_processor_id());
- printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
-
- /*
- * Scrub the error so we don't pick it up in MCE_RATE
- * seconds time.
- */
- wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
+
+ printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+
+ /*
+ * Scrub the error so we don't pick it up in MCE_RATE
+ * seconds time:
+ */
+ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+
+ /* Serialize: */
+ wmb();
+ add_taint(TAINT_MACHINE_CHECK);
}
}
@@ -77,16 +76,17 @@ static int __init init_nonfatal_mce_checker(void)
/* Some Athlons misbehave when we frob bank 0 */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 6)
- firstbank = 1;
+ boot_cpu_data.x86 == 6)
+ firstbank = 1;
else
- firstbank = 0;
+ firstbank = 0;
/*
* Check for non-fatal errors every MCE_RATE s
*/
schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
printk(KERN_INFO "Machine check exception polling timer started.\n");
+
return 0;
}
module_init(init_nonfatal_mce_checker);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf38..4482aea9aa2 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -1,21 +1,14 @@
/*
* P4 specific Machine Check Exception Reporting
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/interrupt.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/apic.h>
-
-#include <asm/therm_throt.h>
-
-#include "mce.h"
/* as supported by the P4/Xeon family */
struct intel_mce_extended_msrs {
@@ -34,98 +27,8 @@ struct intel_mce_extended_msrs {
static int mce_num_extended_msrs;
-
-#ifdef CONFIG_X86_MCE_P4THERMAL
-static void unexpected_thermal_interrupt(struct pt_regs *regs)
-{
- printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
- smp_processor_id());
- add_taint(TAINT_MACHINE_CHECK);
-}
-
-/* P4/Xeon Thermal transition interrupt handler */
-static void intel_thermal_interrupt(struct pt_regs *regs)
-{
- __u64 msr_val;
-
- ack_APIC_irq();
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- therm_throt_process(msr_val & 0x1);
-}
-
-/* Thermal interrupt handler for this CPU setup */
-static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
-
-void smp_thermal_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- vendor_thermal_interrupt(regs);
- __get_cpu_var(irq_stat).irq_thermal_count++;
- irq_exit();
-}
-
-/* P4/Xeon Thermal regulation detect and init */
-static void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- unsigned int cpu = smp_processor_id();
-
- /* Thermal monitoring */
- if (!cpu_has(c, X86_FEATURE_ACPI))
- return; /* -ENODEV */
-
- /* Clock modulation */
- if (!cpu_has(c, X86_FEATURE_ACC))
- return; /* -ENODEV */
-
- /* first check if its enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already -zwanem.
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
- cpu);
- return; /* -EBUSY */
- }
-
- /* check whether a vector already exists, temporarily masked? */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
- "installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return; /* -EBUSY */
- }
-
- /* The temperature transition interrupt handler setup */
- h = THERMAL_APIC_VECTOR; /* our delivery vector */
- h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
-
- /* ok we're good to go... */
- vendor_thermal_interrupt = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
- return;
-}
-#endif /* CONFIG_X86_MCE_P4THERMAL */
-
-
/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
+static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{
u32 h;
@@ -143,9 +46,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
+ int recover = 1;
int i;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +60,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
if (mce_num_extended_msrs > 0) {
struct intel_mce_extended_msrs dbg;
+
intel_get_extended_msrs(&dbg);
+
printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +76,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
if (high & (1<<31)) {
char misc[20];
char addr[24];
+
misc[0] = addr[0] = '\0';
if (high & (1<<29))
recover |= 1;
@@ -196,6 +102,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
panic("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
+
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +124,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
-
void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69ed..5c0e6533d9b 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,52 +2,67 @@
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/mce.h>
#include <asm/msr.h>
-#include "mce.h"
+/* By default disabled */
+int mce_p5_enabled __read_mostly;
-/* Machine check handler for Pentium class Intel */
+/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
u32 loaddr, hi, lotype;
+
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
- printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
- if (lotype&(1<<5))
- printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
+
+ printk(KERN_EMERG
+ "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
+
+ if (lotype & (1<<5)) {
+ printk(KERN_EMERG
+ "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
+ }
+
add_taint(TAINT_MACHINE_CHECK);
}
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
- /*Check for MCE support */
- if (!cpu_has(c, X86_FEATURE_MCE))
+ /* Default P5 to off as its often misconnected: */
+ if (!mce_p5_enabled)
return;
- /* Default P5 to off as its often misconnected */
- if (mce_disabled != -1)
+ /* Check for MCE support: */
+ if (!cpu_has(c, X86_FEATURE_MCE))
return;
+
machine_check_vector = pentium_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
- /* Read registers before enabling */
+ /* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
- printk(KERN_INFO "Intel old style machine check architecture supported.\n");
+ printk(KERN_INFO
+ "Intel old style machine check architecture supported.\n");
- /* Enable MCE */
+ /* Enable MCE: */
set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+ printk(KERN_INFO
+ "Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434..01e4f817818 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,25 +2,23 @@
* P6 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/mce.h>
#include <asm/msr.h>
-#include "mce.h"
-
/* Machine Check Handler For PII/PIII */
static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
+ int recover = 1;
int i;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +33,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
if (high & (1<<31)) {
char misc[20];
char addr[24];
- misc[0] = addr[0] = '\0';
+
+ misc[0] = '\0';
+ addr[0] = '\0';
+
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
high &= ~(1<<31);
+
if (high & (1<<27)) {
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +51,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
snprintf(addr, 24, " at %08x%08x", ahigh, alow);
}
+
printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
smp_processor_id(), i, high, low, misc, addr);
}
@@ -63,16 +66,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error.
+ * for errors if the OS could not log the error:
*/
for (i = 0; i < nr_mce_banks; i++) {
unsigned int msr;
+
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr(msr, low, high);
if (high & (1<<31)) {
- /* Clear it */
+ /* Clear it: */
wrmsr(msr, 0UL, 0UL);
- /* Serialize */
+ /* Serialize: */
wmb();
add_taint(TAINT_MACHINE_CHECK);
}
@@ -81,7 +85,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -97,6 +101,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b..5957a93e517 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
/*
- *
* Thermal throttle event support code (such as syslog messaging and rate
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
* This allows consistent reporting of CPU thermal throttle events.
*
* Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,54 @@
* Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
* Inspired by Ross Biro's and Al Borchers' counter code.
*/
-
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sysdev.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
#include <linux/cpu.h>
-#include <asm/cpu.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <asm/therm_throt.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
+#define CHECK_INTERVAL (300 * HZ)
static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
-atomic_t therm_throt_en = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(bool, thermal_throttle_active);
+
+static atomic_t therm_throt_en = ATOMIC_INIT(0);
#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
-
-#define define_therm_throt_sysdev_show_func(name) \
-static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_throttle_##name, cpu)); \
- else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
+#define define_therm_throt_sysdev_one_ro(_name) \
+ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+
+#define define_therm_throt_sysdev_show_func(name) \
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+ char *buf) \
+{ \
+ unsigned int cpu = dev->id; \
+ ssize_t ret; \
+ \
+ preempt_disable(); /* CPU hotplug */ \
+ if (cpu_online(cpu)) \
+ ret = sprintf(buf, "%lu\n", \
+ per_cpu(thermal_throttle_##name, cpu)); \
+ else \
+ ret = 0; \
+ preempt_enable(); \
+ \
+ return ret; \
}
define_therm_throt_sysdev_show_func(count);
@@ -61,8 +72,8 @@ static struct attribute *thermal_throttle_attrs[] = {
};
static struct attribute_group thermal_throttle_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
+ .attrs = thermal_throttle_attrs,
+ .name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
@@ -82,38 +93,45 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-int therm_throt_process(int curr)
+static int therm_throt_process(int curr)
{
unsigned int cpu = smp_processor_id();
__u64 tmp_jiffs = get_jiffies_64();
+ bool was_throttled = __get_cpu_var(thermal_throttle_active);
+ bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
- if (curr)
+ if (is_throttled)
__get_cpu_var(thermal_throttle_count)++;
- if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+ if (!(was_throttled ^ is_throttled) &&
+ time_before64(tmp_jiffs, __get_cpu_var(next_check)))
return 0;
__get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
/* if we just entered the thermal event */
- if (curr) {
+ if (is_throttled) {
printk(KERN_CRIT "CPU%d: Temperature above threshold, "
- "cpu clock throttled (total events = %lu)\n", cpu,
- __get_cpu_var(thermal_throttle_count));
+ "cpu clock throttled (total events = %lu)\n",
+ cpu, __get_cpu_var(thermal_throttle_count));
add_taint(TAINT_MACHINE_CHECK);
- } else {
- printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+ return 1;
+ }
+ if (was_throttled) {
+ printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
+ return 1;
}
- return 1;
+ return 0;
}
#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device */
+/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
{
- return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ return sysfs_create_group(&sys_dev->kobj,
+ &thermal_throttle_attr_group);
}
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +139,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
}
-/* Mutex protecting device creation against CPU hotplug */
+/* Mutex protecting device creation against CPU hotplug: */
static DEFINE_MUTEX(therm_cpu_lock);
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static __cpuinit int
+thermal_throttle_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct sys_device *sys_dev;
int err = 0;
sys_dev = get_cpu_sysdev(cpu);
+
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -183,6 +203,94 @@ static __init int thermal_throttle_init_device(void)
return 0;
}
-
device_initcall(thermal_throttle_init_device);
+
#endif /* CONFIG_SYSFS */
+
+/* Thermal transition interrupt handler */
+static void intel_thermal_interrupt(void)
+{
+ __u64 msr_val;
+
+ rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
+ mce_log_therm_throt_event(msr_val);
+}
+
+static void unexpected_thermal_interrupt(void)
+{
+ printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
+ smp_processor_id());
+ add_taint(TAINT_MACHINE_CHECK);
+}
+
+static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
+
+asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+{
+ exit_idle();
+ irq_enter();
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ irq_exit();
+ /* Ack only at the end to avoid potential reentry */
+ ack_APIC_irq();
+}
+
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+ unsigned int cpu = smp_processor_id();
+ int tm2 = 0;
+ u32 l, h;
+
+ /* Thermal monitoring depends on ACPI and clock modulation*/
+ if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+ return;
+
+ /*
+ * First check if its enabled already, in which case there might
+ * be some SMM goo which handles it, so we can't even put a handler
+ * since it might be delivered via SMI already:
+ */
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ h = apic_read(APIC_LVTTHMR);
+ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
+ printk(KERN_DEBUG
+ "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ return;
+ }
+
+ if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
+ tm2 = 1;
+
+ /* Check whether a vector already exists */
+ if (h & APIC_VECTOR_MASK) {
+ printk(KERN_DEBUG
+ "CPU%d: Thermal LVT vector (%#x) already installed\n",
+ cpu, (h & APIC_VECTOR_MASK));
+ return;
+ }
+
+ /* We'll mask the thermal vector in the lapic till we're ready: */
+ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+ apic_write(APIC_LVTTHMR, h);
+
+ rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ smp_thermal_vector = intel_thermal_interrupt;
+
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
+
+ /* Unmask the thermal vector: */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+ printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+ cpu, tm2 ? "TM2" : "TM1");
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f7..d746df2909c 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage void mce_threshold_interrupt(void)
+asmlinkage void smp_threshold_interrupt(void)
{
exit_idle();
irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811..54060f56597 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,19 +2,17 @@
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/mce.h>
#include <asm/msr.h>
-#include "mce.h"
-
-/* Machine check handler for WinChip C6 */
+/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +23,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
+
machine_check_vector = winchip_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
+
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
+
set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+
+ printk(KERN_INFO
+ "Winchip machine check reporting enabled on CPU#0.\n");
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04..1d584a18a50 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
+ rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
*/
if (!is_cpu(INTEL) || disable_mtrr_trim)
return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
+ rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d21d4fb161f..0543f69f0b2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
};
static struct fixed_range_block fixed_range_blocks[] = {
- { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
- { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
- { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
{}
};
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
k8_check_syscfg_dram_mod_en();
- rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+ rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
for (i = 0; i < 2; i++)
- rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+ rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
for (i = 0; i < 8; i++)
- rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+ rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
}
void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
vrs = mtrr_state.var_ranges;
- rdmsr(MTRRcap_MSR, lo, dummy);
+ rdmsr(MSR_MTRRcap, lo, dummy);
mtrr_state.have_fixed = (lo >> 8) & 1;
for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
- rdmsr(MTRRdefType_MSR, lo, dummy);
+ rdmsr(MSR_MTRRdefType, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
__flush_tlb();
/* Save MTRR state */
- rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
}
static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
__flush_tlb();
/* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Enable caches */
write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
- rdmsr(MTRRcap_MSR, config, dummy);
+ rdmsr(MSR_MTRRcap, config, dummy);
return (config & (1 << 10));
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c..8fc248b5aea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
unsigned long config = 0, dummy;
if (use_intel()) {
- rdmsr(MTRRcap_MSR, config, dummy);
+ rdmsr(MSR_MTRRcap, config, dummy);
} else if (is_cpu(AMD))
config = 2;
else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347..7538b767f20 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
#include <linux/types.h>
#include <linux/stddef.h>
-#define MTRRcap_MSR 0x0fe
-#define MTRRdefType_MSR 0x2ff
-
-#define MTRRfix64K_00000_MSR 0x250
-#define MTRRfix16K_80000_MSR 0x258
-#define MTRRfix16K_A0000_MSR 0x259
-#define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
-
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685..1f5fb1588d1 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
if (use_intel())
/* Save MTRR state */
- rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
else
/* Cyrix ARRs - everything else were excluded at the top */
ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
{
if (use_intel())
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
ctxt->deftype_hi);
else if (is_cpu(CYRIX))
/* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
/* Restore MTRRdefType */
if (use_intel())
/* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
else
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 00000000000..900332b800f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1964 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_counter_mask __read_mostly;
+
+struct cpu_hw_counters {
+ struct perf_counter *counters[X86_PMC_IDX_MAX];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long interrupts;
+ int enabled;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+ const char *name;
+ int version;
+ int (*handle_irq)(struct pt_regs *);
+ void (*disable_all)(void);
+ void (*enable_all)(void);
+ void (*enable)(struct hw_perf_counter *, int);
+ void (*disable)(struct hw_perf_counter *, int);
+ unsigned eventsel;
+ unsigned perfctr;
+ u64 (*event_map)(int);
+ u64 (*raw_event)(u64);
+ int max_events;
+ int num_counters;
+ int num_counters_fixed;
+ int counter_bits;
+ u64 counter_mask;
+ int apic;
+ u64 max_period;
+ u64 intel_ctrl;
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+ .enabled = 1,
+};
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+};
+
+static u64 p6_pmu_event_map(int event)
+{
+ return p6_perfmon_event_map[event];
+}
+
+/*
+ * Counter setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_COUNTER 0x0000002EULL
+
+static u64 p6_pmu_raw_event(u64 event)
+{
+#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define P6_EVNTSEL_INV_MASK 0x00800000ULL
+#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+
+#define P6_EVNTSEL_MASK \
+ (P6_EVNTSEL_EVENT_MASK | \
+ P6_EVNTSEL_UNIT_MASK | \
+ P6_EVNTSEL_EDGE_MASK | \
+ P6_EVNTSEL_INV_MASK | \
+ P6_EVNTSEL_COUNTER_MASK)
+
+ return event & P6_EVNTSEL_MASK;
+}
+
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+};
+
+static u64 intel_pmu_event_map(int event)
+{
+ return intel_perfmon_event_map[event];
+}
+
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
+#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK \
+ (CORE_EVNTSEL_EVENT_MASK | \
+ CORE_EVNTSEL_UNIT_MASK | \
+ CORE_EVNTSEL_EDGE_MASK | \
+ CORE_EVNTSEL_INV_MASK | \
+ CORE_EVNTSEL_COUNTER_MASK)
+
+ return event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
+ [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+ [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+ return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
+#define K7_EVNTSEL_INV_MASK 0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK \
+ (K7_EVNTSEL_EVENT_MASK | \
+ K7_EVNTSEL_UNIT_MASK | \
+ K7_EVNTSEL_EDGE_MASK | \
+ K7_EVNTSEL_INV_MASK | \
+ K7_EVNTSEL_COUNTER_MASK)
+
+ return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ int shift = 64 - x86_pmu.counter_bits;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous counter value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic counter atomically:
+ */
+again:
+ prev_raw_count = atomic64_read(&hwc->prev_count);
+ rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+ if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (counter-)time and add that to the generic counter.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ atomic64_add(delta, &counter->count);
+ atomic64_sub(delta, &hwc->period_left);
+
+ return new_raw_count;
+}
+
+static atomic_t active_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ int i;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ disable_lapic_nmi_watchdog();
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+ goto perfctr_fail;
+ }
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+ goto eventsel_fail;
+ }
+#endif
+
+ return true;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+eventsel_fail:
+ for (i--; i >= 0; i--)
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+
+ i = x86_pmu.num_counters;
+
+perfctr_fail:
+ for (i--; i >= 0; i--)
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+
+ return false;
+#endif
+}
+
+static void release_pmc_hardware(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ int i;
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+#endif
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+ if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+static inline int x86_pmu_initialized(void)
+{
+ return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ unsigned int cache_type, cache_op, cache_result;
+ u64 config, val;
+
+ config = attr->config;
+
+ cache_type = (config >> 0) & 0xff;
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+ return -EINVAL;
+
+ cache_op = (config >> 8) & 0xff;
+ if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+ return -EINVAL;
+
+ cache_result = (config >> 16) & 0xff;
+ if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+
+ val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+ if (val == 0)
+ return -ENOENT;
+
+ if (val == -1)
+ return -EINVAL;
+
+ hwc->config |= val;
+
+ return 0;
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+ struct perf_counter_attr *attr = &counter->attr;
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 config;
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ atomic_inc(&active_counters);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ if (err)
+ return err;
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to.
+ */
+ if (!attr->exclude_user)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!attr->exclude_kernel)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (!hwc->sample_period) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+ } else {
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * counters (user-space has to fall back and
+ * sample via a hrtimer based software counter):
+ */
+ if (!x86_pmu.apic)
+ return -EOPNOTSUPP;
+ }
+
+ counter->destroy = hw_perf_counter_destroy;
+
+ /*
+ * Raw event type provide the config in the event structure
+ */
+ if (attr->type == PERF_TYPE_RAW) {
+ hwc->config |= x86_pmu.raw_event(attr->config);
+ return 0;
+ }
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, attr);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+
+ /*
+ * The generic map:
+ */
+ config = x86_pmu.event_map(attr->config);
+
+ if (config == 0)
+ return -ENOENT;
+
+ if (config == -1LL)
+ return -EINVAL;
+
+ hwc->config |= config;
+
+ return 0;
+}
+
+static void p6_pmu_disable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ u64 val;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ barrier();
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void intel_pmu_disable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ /*
+ * ensure we write the disable before we start disabling the
+ * counters proper, so that amd_pmu_enable_counter() does the
+ * right thing.
+ */
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_disable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ return x86_pmu.disable_all();
+}
+
+static void p6_pmu_enable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ unsigned long val;
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void intel_pmu_enable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static void amd_pmu_enable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_counter *counter = cpuc->counters[idx];
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ val = counter->hw.config;
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_enable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ (void)checking_wrmsrl(hwc->config_base + idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ u64 val = P6_NOP_COUNTER;
+
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + idx, val);
+}
+
+static inline void
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_disable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static int
+x86_perf_counter_set_period(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ s64 left = atomic64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int err, ret = 0;
+
+ /*
+ * If we are way outside a reasoable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+ * Quirk: certain CPUs dont like it if just 1 event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw counter starts counting from this counter offset,
+ * mark it to be able to extra future deltas:
+ */
+ atomic64_set(&hwc->prev_count, (u64)-left);
+
+ err = checking_wrmsrl(hwc->counter_base + idx,
+ (u64)(-left) & x86_pmu.counter_mask);
+
+ perf_counter_update_userpage(counter);
+
+ return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+ int err;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + idx, val);
+}
+
+
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ x86_pmu_enable_counter(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+{
+ unsigned int event;
+
+ if (!x86_pmu.num_counters_fixed)
+ return -1;
+
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+ return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+ return X86_PMC_IDX_FIXED_CPU_CYCLES;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+ return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+ return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
+static int x86_pmu_enable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx;
+
+ idx = fixed_mode_idx(counter, hwc);
+ if (idx >= 0) {
+ /*
+ * Try to get the fixed counter, if that is already taken
+ * then try to get a generic counter:
+ */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ goto try_generic;
+
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->counter_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ hwc->idx = idx;
+ } else {
+ idx = hwc->idx;
+ /* Try to get the previous generic counter again */
+ if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+ idx = find_first_zero_bit(cpuc->used_mask,
+ x86_pmu.num_counters);
+ if (idx == x86_pmu.num_counters)
+ return -EAGAIN;
+
+ set_bit(idx, cpuc->used_mask);
+ hwc->idx = idx;
+ }
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->counter_base = x86_pmu.perfctr;
+ }
+
+ perf_counters_lapic_init();
+
+ x86_pmu.disable(hwc, idx);
+
+ cpuc->counters[idx] = counter;
+ set_bit(idx, cpuc->active_mask);
+
+ x86_perf_counter_set_period(counter, hwc, idx);
+ x86_pmu.enable(hwc, idx);
+
+ perf_counter_update_userpage(counter);
+
+ return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+ cpuc->counters[hwc->idx] != counter))
+ return;
+
+ x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_counter_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ struct cpu_hw_counters *cpuc;
+ unsigned long flags;
+ int cpu, idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ if (x86_pmu.version >= 2) {
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ }
+ pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+ rdmsrl(x86_pmu.perfctr + idx, pmc_count);
+
+ prev_left = per_cpu(prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+ local_irq_restore(flags);
+}
+
+static void x86_pmu_disable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+
+ /*
+ * Must be done before we disable, otherwise the nmi handler
+ * could reenable again:
+ */
+ clear_bit(idx, cpuc->active_mask);
+ x86_pmu.disable(hwc, idx);
+
+ /*
+ * Make sure the cleared pointer becomes visible before we
+ * (potentially) free the counter:
+ */
+ barrier();
+
+ /*
+ * Drain the remaining delta count out of a counter
+ * that we are disabling:
+ */
+ x86_perf_counter_update(counter, hwc, idx);
+ cpuc->counters[idx] = NULL;
+ clear_bit(idx, cpuc->used_mask);
+
+ perf_counter_update_userpage(counter);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+ int ret;
+
+ x86_perf_counter_update(counter, hwc, idx);
+ ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ intel_pmu_enable_counter(hwc, idx);
+
+ return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+ checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
+
+ local_irq_restore(flags);
+}
+
+static int p6_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ counter = cpuc->counters[idx];
+ hwc = &counter->hw;
+
+ val = x86_perf_counter_update(counter, hwc, idx);
+ if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ continue;
+
+ /*
+ * counter overflow
+ */
+ handled = 1;
+ data.period = counter->hw.last_period;
+
+ if (!x86_perf_counter_set_period(counter, hwc, idx))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ p6_pmu_disable_counter(hwc, idx);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ int bit, loops;
+ u64 ack, status;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ perf_disable();
+ status = intel_pmu_get_status();
+ if (!status) {
+ perf_enable();
+ return 0;
+ }
+
+ loops = 0;
+again:
+ if (++loops > 100) {
+ WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+ perf_counter_print_debug();
+ intel_pmu_reset();
+ perf_enable();
+ return 1;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+ ack = status;
+ for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_counter *counter = cpuc->counters[bit];
+
+ clear_bit(bit, (unsigned long *) &status);
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(counter))
+ continue;
+
+ data.period = counter->hw.last_period;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ intel_pmu_disable_counter(&counter->hw, bit);
+ }
+
+ intel_pmu_ack_status(ack);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+ perf_enable();
+
+ return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ counter = cpuc->counters[idx];
+ hwc = &counter->hw;
+
+ val = x86_perf_counter_update(counter, hwc, idx);
+ if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ continue;
+
+ /*
+ * counter overflow
+ */
+ handled = 1;
+ data.period = counter->hw.last_period;
+
+ if (!x86_perf_counter_set_period(counter, hwc, idx))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ amd_pmu_disable_counter(hwc, idx);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_pending_irqs);
+ perf_counter_do_pending();
+ irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
+}
+
+void perf_counters_lapic_init(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (!x86_pmu.apic || !x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+
+ if (!atomic_read(&active_counters))
+ return NOTIFY_DONE;
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
+ /*
+ * Can't rely on the handled return value to say it was our NMI, two
+ * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+ *
+ * If the first NMI handles both, the latter will be empty and daze
+ * the CPU.
+ */
+ x86_pmu.handle_irq(regs);
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+ .notifier_call = perf_counter_nmi_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = p6_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_counter,
+ .disable = p6_pmu_disable_counter,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .raw_event = p6_pmu_raw_event,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_counters = 2,
+ /*
+ * Counters have 40 bits implemented. However they are designed such
+ * that bits [32-39] are sign extensions of bit 31. As such the
+ * effective width of a counter for P6-like PMU is 32 bits only.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B
+ */
+ .counter_bits = 32,
+ .counter_mask = (1ULL << 32) - 1,
+};
+
+static struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_counter,
+ .disable = intel_pmu_disable_counter,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic counter period:
+ */
+ .max_period = (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_counter,
+ .disable = amd_pmu_disable_counter,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .event_map = amd_pmu_event_map,
+ .raw_event = amd_pmu_raw_event,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_counters = 4,
+ .counter_bits = 48,
+ .counter_mask = (1ULL << 48) - 1,
+ .apic = 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+};
+
+static int p6_pmu_init(void)
+{
+ switch (boot_cpu_data.x86_model) {
+ case 1:
+ case 3: /* Pentium Pro */
+ case 5:
+ case 6: /* Pentium II */
+ case 7:
+ case 8:
+ case 11: /* Pentium III */
+ break;
+ case 9:
+ case 13:
+ /* Pentium M */
+ break;
+ default:
+ pr_cont("unsupported p6 CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ x86_pmu = p6_pmu;
+
+ if (!cpu_has_apic) {
+ pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+ pr_info("no hardware sampling interrupt available.\n");
+ x86_pmu.apic = 0;
+ }
+
+ return 0;
+}
+
+static int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+ int version;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ /* check for P6 processor family */
+ if (boot_cpu_data.x86 == 6) {
+ return p6_pmu_init();
+ } else {
+ return -ENODEV;
+ }
+ }
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired Event or not.
+ */
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ return -ENODEV;
+
+ x86_pmu = intel_pmu;
+ x86_pmu.version = version;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.counter_bits = eax.split.bit_width;
+ x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose counters, so
+ * assume at least 3 counters:
+ */
+ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 29: /* six-core 45 nm xeon "Dunnington" */
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Core2 events, ");
+ break;
+ default:
+ case 26:
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Nehalem/Corei7 events, ");
+ break;
+ case 28:
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Atom events, ");
+ break;
+ }
+ return 0;
+}
+
+static int amd_pmu_init(void)
+{
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ /* Events are common for all AMDs */
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+ int err;
+
+ pr_info("Performance Counters: ");
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_pmu_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_pmu_init();
+ break;
+ default:
+ return;
+ }
+ if (err != 0) {
+ pr_cont("no PMU driver, software counters only.\n");
+ return;
+ }
+
+ pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+ if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+ x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+ x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+ }
+ perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+ perf_max_counters = x86_pmu.num_counters;
+
+ if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+ }
+
+ perf_counter_mask |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl = perf_counter_mask;
+
+ perf_counters_lapic_init();
+ register_die_notifier(&perf_counter_nmi_notifier);
+
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.counter_bits);
+ pr_info("... generic counters: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... counter mask: %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+ .enable = x86_pmu_enable,
+ .disable = x86_pmu_disable,
+ .read = x86_pmu_read,
+ .unthrottle = x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ int err;
+
+ err = __hw_perf_counter_init(counter);
+ if (err)
+ return ERR_PTR(err);
+
+ return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+ if (entry->nr < PERF_MAX_STACK_DEPTH)
+ entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+static DEFINE_PER_CPU(int, in_nmi_frame);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ /* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+ /* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+ per_cpu(in_nmi_frame, smp_processor_id()) =
+ x86_is_stack_id(NMI_STACK, name);
+
+ return 0;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+ struct perf_callchain_entry *entry = data;
+
+ if (per_cpu(in_nmi_frame, smp_processor_id()))
+ return;
+
+ if (reliable)
+ callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+ .warning = backtrace_warning,
+ .warning_symbol = backtrace_warning_symbol,
+ .stack = backtrace_stack,
+ .address = backtrace_address,
+};
+
+#include "../dumpstack.h"
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ callchain_store(entry, PERF_CONTEXT_KERNEL);
+ callchain_store(entry, regs->ip);
+
+ dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long offset, addr = (unsigned long)from;
+ int type = in_nmi() ? KM_NMI : KM_IRQ0;
+ unsigned long size, len = 0;
+ struct page *page;
+ void *map;
+ int ret;
+
+ do {
+ ret = __get_user_pages_fast(addr, 1, 0, &page);
+ if (!ret)
+ break;
+
+ offset = addr & (PAGE_SIZE - 1);
+ size = min(PAGE_SIZE - offset, n - len);
+
+ map = kmap_atomic(page, type);
+ memcpy(to, map+offset, size);
+ kunmap_atomic(map, type);
+ put_page(page);
+
+ len += size;
+ to += size;
+ addr += size;
+
+ } while (len < n);
+
+ return len;
+}
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ unsigned long bytes;
+
+ bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+
+ return bytes == sizeof(*frame);
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ struct stack_frame frame;
+ const void __user *fp;
+
+ if (!user_mode(regs))
+ regs = task_pt_regs(current);
+
+ fp = (void __user *)regs->bp;
+
+ callchain_store(entry, PERF_CONTEXT_USER);
+ callchain_store(entry, regs->ip);
+
+ while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ frame.next_frame = NULL;
+ frame.return_address = 0;
+
+ if (!copy_stack_frame(fp, &frame))
+ break;
+
+ if ((unsigned long)fp < regs->sp)
+ break;
+
+ callchain_store(entry, frame.return_address);
+ fp = frame.next_frame;
+ }
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ int is_user;
+
+ if (!regs)
+ return;
+
+ is_user = user_mode(regs);
+
+ if (!current || current->pid == 0)
+ return;
+
+ if (is_user && current->state != TASK_RUNNING)
+ return;
+
+ if (!is_user)
+ perf_callchain_kernel(regs, entry);
+
+ if (current->mm)
+ perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ struct perf_callchain_entry *entry;
+
+ if (in_nmi())
+ entry = &__get_cpu_var(nmi_entry);
+ else
+ entry = &__get_cpu_var(irq_entry);
+
+ entry->nr = 0;
+
+ perf_do_callchain(regs, entry);
+
+ return entry;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e3..e60ed740d2b 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
#include <linux/nmi.h>
#include <linux/kprobes.h>
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
struct nmi_watchdog_ctlblk {
unsigned int cccr_msr;
@@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void)
wd_ops = &k7_wd_ops;
break;
case X86_VENDOR_INTEL:
- /*
- * Work around Core Duo (Yonah) errata AE49 where perfctr1
- * doesn't have a working enable bit.
+ /* Work around where perfctr1 doesn't have a working enable
+ * bit as described in the following errata:
+ * AE49 Core Duo and Intel Core Solo 65 nm
+ * AN49 Intel Pentium Dual-Core
+ * AF49 Dual-Core Intel Xeon Processor LV
*/
- if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
+ if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
+ ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
+ boot_cpu_data.x86_mask == 4))) {
intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
}
@@ -799,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz)
wd_ops->rearm(wd, nmi_hz);
return 1;
}
-
-int lapic_watchdog_ok(void)
-{
- return wd_ops != NULL;
-}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb..b07af886124 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
.notifier_call = cpuid_class_cpu_callback,
};
+static char *cpuid_nodename(struct device *dev)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+}
+
static int __init cpuid_init(void)
{
int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
err = PTR_ERR(cpuid_class);
goto out_chrdev;
}
+ cpuid_class->nodename = cpuid_nodename;
for_each_online_cpu(i) {
err = cpuid_device_create(i);
if (err != 0)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ff958248e61..5e409dc298a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,6 +27,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
+#include <asm/iommu.h>
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+#ifdef CONFIG_X86_64
+ pci_iommu_shutdown();
+#endif
+
crash_save_cpu(regs, safe_smp_processor_id());
}
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765..48bfe138603 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
* Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
*/
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/kernel.h>
+#include <linux/trace_clock.h>
+
+#include <asm/ds.h>
+#include "ds_selftest.h"
/*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
*/
struct ds_configuration {
- /* the name of the configuration */
- const char *name;
- /* the size of one pointer-typed field in the DS structure and
- in the BTS and PEBS buffers in bytes;
- this covers the first 8 DS fields related to buffer management. */
- unsigned char sizeof_field;
- /* the size of a BTS/PEBS record in bytes */
- unsigned char sizeof_rec[2];
- /* a series of bit-masks to control various features indexed
- * by enum ds_feature */
- unsigned long ctl[dsf_ctl_max];
+ /* The name of the configuration: */
+ const char *name;
+
+ /* The size of pointer-typed fields in DS, BTS, and PEBS: */
+ unsigned char sizeof_ptr_field;
+
+ /* The size of a BTS/PEBS record in bytes: */
+ unsigned char sizeof_rec[2];
+
+ /* The number of pebs counter reset values in the DS structure. */
+ unsigned char nr_counter_reset;
+
+ /* Control bit-masks indexed by enum ds_feature: */
+ unsigned long ctl[dsf_ctl_max];
};
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
+
+
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS 0x80
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS (3 * 8)
-#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT (1 << 3)
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
- ds_cfg.ctl[dsf_bts_overflow])
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS 8
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE 8
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL \
+ ( ds_cfg.ctl[dsf_bts] | \
+ ds_cfg.ctl[dsf_bts_kernel] | \
+ ds_cfg.ctl[dsf_bts_user] | \
+ ds_cfg.ctl[dsf_bts_overflow] )
/*
* A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
* to identify tracers.
*/
struct ds_tracer {
- /* the DS context (partially) owned by this tracer */
- struct ds_context *context;
- /* the buffer provided on ds_request() and its size in bytes */
- void *buffer;
- size_t size;
+ /* The DS context (partially) owned by this tracer. */
+ struct ds_context *context;
+ /* The buffer provided on ds_request() and its size in bytes. */
+ void *buffer;
+ size_t size;
};
struct bts_tracer {
- /* the common DS part */
- struct ds_tracer ds;
- /* the trace including the DS configuration */
- struct bts_trace trace;
- /* buffer overflow notification function */
- bts_ovfl_callback_t ovfl;
+ /* The common DS part: */
+ struct ds_tracer ds;
+
+ /* The trace including the DS configuration: */
+ struct bts_trace trace;
+
+ /* Buffer overflow notification function: */
+ bts_ovfl_callback_t ovfl;
+
+ /* Active flags affecting trace collection. */
+ unsigned int flags;
};
struct pebs_tracer {
- /* the common DS part */
- struct ds_tracer ds;
- /* the trace including the DS configuration */
- struct pebs_trace trace;
- /* buffer overflow notification function */
- pebs_ovfl_callback_t ovfl;
+ /* The common DS part: */
+ struct ds_tracer ds;
+
+ /* The trace including the DS configuration: */
+ struct pebs_trace trace;
+
+ /* Buffer overflow notification function: */
+ pebs_ovfl_callback_t ovfl;
};
/*
@@ -97,6 +120,7 @@ struct pebs_tracer {
*
* The DS configuration consists of the following fields; different
* architetures vary in the size of those fields.
+ *
* - double-word aligned base linear address of the BTS buffer
* - write pointer into the BTS buffer
* - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
};
enum ds_qualifier {
- ds_bts = 0,
+ ds_bts = 0,
ds_pebs
};
-static inline unsigned long ds_get(const unsigned char *base,
- enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
return *(unsigned long *)base;
}
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
- enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+ unsigned long value)
{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
(*(unsigned long *)base) = value;
}
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
*/
static DEFINE_SPINLOCK(ds_lock);
-
/*
* We either support (system-wide) per-cpu or per-thread allocation.
* We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
*/
static atomic_t tracers = ATOMIC_INIT(0);
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
{
- if (task)
+ int error;
+
+ spin_lock_irq(&ds_lock);
+
+ if (task) {
+ error = -EPERM;
+ if (atomic_read(&tracers) < 0)
+ goto out;
atomic_inc(&tracers);
- else
+ } else {
+ error = -EPERM;
+ if (atomic_read(&tracers) > 0)
+ goto out;
atomic_dec(&tracers);
+ }
+
+ error = 0;
+out:
+ spin_unlock_irq(&ds_lock);
+ return error;
}
static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
atomic_inc(&tracers);
}
-static inline int check_tracer(struct task_struct *task)
-{
- return task ?
- (atomic_read(&tracers) >= 0) :
- (atomic_read(&tracers) <= 0);
-}
-
-
/*
* The DS context is either attached to a thread or to a cpu:
* - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
* deallocated when the last user puts the context.
*/
struct ds_context {
- /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
- unsigned char ds[MAX_SIZEOF_DS];
- /* the owner of the BTS and PEBS configuration, respectively */
- struct bts_tracer *bts_master;
- struct pebs_tracer *pebs_master;
- /* use count */
- unsigned long count;
- /* a pointer to the context location inside the thread_struct
- * or the per_cpu context array */
- struct ds_context **this;
- /* a pointer to the task owning this context, or NULL, if the
- * context is owned by a cpu */
- struct task_struct *task;
-};
+ /* The DS configuration; goes into MSR_IA32_DS_AREA: */
+ unsigned char ds[MAX_SIZEOF_DS];
+
+ /* The owner of the BTS and PEBS configuration, respectively: */
+ struct bts_tracer *bts_master;
+ struct pebs_tracer *pebs_master;
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+ /* Use count: */
+ unsigned long count;
-#define system_context per_cpu(system_context_array, smp_processor_id())
+ /* Pointer to the context pointer field: */
+ struct ds_context **this;
+
+ /* The traced task; NULL for cpu tracing: */
+ struct task_struct *task;
+
+ /* The traced cpu; only valid if task is NULL: */
+ int cpu;
+};
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
{
struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &system_context);
+ (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
struct ds_context *context = NULL;
struct ds_context *new_context = NULL;
- unsigned long irq;
/* Chances are small that we already have a context. */
new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
if (!new_context)
return NULL;
- spin_lock_irqsave(&ds_lock, irq);
+ spin_lock_irq(&ds_lock);
context = *p_context;
- if (!context) {
+ if (likely(!context)) {
context = new_context;
context->this = p_context;
context->task = task;
+ context->cpu = cpu;
context->count = 0;
- if (task)
- set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- if (!task || (task == current))
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
*p_context = context;
}
context->count++;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
if (context != new_context)
kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
return context;
}
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
{
+ struct task_struct *task;
unsigned long irq;
if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
*(context->this) = NULL;
- if (context->task)
- clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+ task = context->task;
+
+ if (task)
+ clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
- if (!context->task || (context->task == current))
- wrmsrl(MSR_IA32_DS_AREA, 0);
+ /*
+ * We leave the (now dangling) pointer to the DS configuration in
+ * the DS_AREA msr. This is as good or as bad as replacing it with
+ * NULL - the hardware would crash if we enabled tracing.
+ *
+ * This saves us some problems with having to write an msr on a
+ * different cpu while preventing others from doing the same for the
+ * next context for that same cpu.
+ */
spin_unlock_irqrestore(&ds_lock, irq);
+ /* The context might still be in use for context switching. */
+ if (task && (task != current))
+ wait_task_context_switch(task);
+
kfree(context);
}
+static void ds_install_ds_area(struct ds_context *context)
+{
+ unsigned long ds;
+
+ ds = (unsigned long)context->ds;
+
+ /*
+ * There is a race between the bts master and the pebs master.
+ *
+ * The thread/cpu access is synchronized via get/put_cpu() for
+ * task tracing and via wrmsr_on_cpu for cpu tracing.
+ *
+ * If bts and pebs are collected for the same task or same cpu,
+ * the same confiuration is written twice.
+ */
+ if (context->task) {
+ get_cpu();
+ if (context->task == current)
+ wrmsrl(MSR_IA32_DS_AREA, ds);
+ set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+ put_cpu();
+ } else
+ wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
/*
* Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
* The remainder of any partially written record is zeroed out.
*
* context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
*/
static int ds_write(struct ds_context *context, enum ds_qualifier qual,
const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
unsigned long write_size, adj_write_size;
/*
- * write as much as possible without producing an
+ * Write as much as possible without producing an
* overflow interrupt.
*
- * interrupt_threshold must either be
+ * Interrupt_threshold must either be
* - bigger than absolute_maximum or
* - point to a record between buffer_base and absolute_maximum
*
- * index points to a valid record.
+ * Index points to a valid record.
*/
base = ds_get(context->ds, qual, ds_buffer_base);
index = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
write_end = min(end, int_th);
- /* if we are already beyond the interrupt threshold,
- * we fill the entire buffer */
+ /*
+ * If we are already beyond the interrupt threshold,
+ * we fill the entire buffer.
+ */
if (write_end <= index)
write_end = end;
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
adj_write_size *= ds_cfg.sizeof_rec[qual];
- /* zero out trailing bytes */
+ /* Zero out trailing bytes. */
memset((char *)index + write_size, 0,
adj_write_size - write_size);
index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
* Later architectures use 64bit pointers throughout, whereas earlier
* architectures use 32bit pointers in 32bit mode.
*
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
* - the field size stored in the DS configuration
* - the relative field position
*
@@ -431,23 +501,23 @@ enum bts_field {
bts_to,
bts_flags,
- bts_qual = bts_from,
- bts_jiffies = bts_to,
- bts_pid = bts_flags,
+ bts_qual = bts_from,
+ bts_clock = bts_to,
+ bts_pid = bts_flags,
- bts_qual_mask = (bts_qual_max - 1),
- bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+ bts_qual_mask = (bts_qual_max - 1),
+ bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
};
static inline unsigned long bts_get(const char *base, enum bts_field field)
{
- base += (ds_cfg.sizeof_field * field);
+ base += (ds_cfg.sizeof_ptr_field * field);
return *(unsigned long *)base;
}
static inline void bts_set(char *base, enum bts_field field, unsigned long val)
{
- base += (ds_cfg.sizeof_field * field);;
+ base += (ds_cfg.sizeof_ptr_field * field);;
(*(unsigned long *)base) = val;
}
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
*
* return: bytes read/written on success; -Eerrno, otherwise
*/
-static int bts_read(struct bts_tracer *tracer, const void *at,
- struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
{
if (!tracer)
return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
memset(out, 0, sizeof(*out));
if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
- out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
- out->variant.timestamp.pid = bts_get(at, bts_pid);
+ out->variant.event.clock = bts_get(at, bts_clock);
+ out->variant.event.pid = bts_get(at, bts_pid);
} else {
out->qualifier = bts_branch;
out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
case bts_task_arrives:
case bts_task_departs:
bts_set(raw, bts_qual, (bts_escape | in->qualifier));
- bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
- bts_set(raw, bts_pid, in->variant.timestamp.pid);
+ bts_set(raw, bts_clock, in->variant.event.clock);
+ bts_set(raw, bts_pid, in->variant.event.pid);
break;
default:
return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
unsigned int flags) {
unsigned long buffer, adj;
- /* adjust the buffer address and size to meet alignment
+ /*
+ * Adjust the buffer address and size to meet alignment
* constraints:
* - buffer is double-word aligned
* - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
trace->begin = (void *)buffer;
trace->top = trace->begin;
trace->end = (void *)(buffer + size);
- /* The value for 'no threshold' is -1, which will set the
+ /*
+ * The value for 'no threshold' is -1, which will set the
* threshold outside of the buffer, just like we want it.
*/
+ ith *= ds_cfg.sizeof_rec[qual];
trace->ith = (void *)(buffer + size - ith);
trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
enum ds_qualifier qual, struct task_struct *task,
- void *base, size_t size, size_t th, unsigned int flags)
+ int cpu, void *base, size_t size, size_t th)
{
struct ds_context *context;
int error;
+ size_t req_size;
+
+ error = -EOPNOTSUPP;
+ if (!ds_cfg.sizeof_rec[qual])
+ goto out;
error = -EINVAL;
if (!base)
goto out;
- /* we require some space to do alignment adjustments below */
+ req_size = ds_cfg.sizeof_rec[qual];
+ /* We might need space for alignment adjustments. */
+ if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+ req_size += DS_ALIGNMENT;
+
error = -EINVAL;
- if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+ if (size < req_size)
goto out;
if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
tracer->size = size;
error = -ENOMEM;
- context = ds_get_context(task);
+ context = ds_get_context(task, cpu);
if (!context)
goto out;
tracer->context = context;
- ds_init_ds_trace(trace, qual, base, size, th, flags);
+ /*
+ * Defer any tracer-specific initialization work for the context until
+ * context ownership has been clarified.
+ */
error = 0;
out:
return error;
}
-struct bts_tracer *ds_request_bts(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
struct bts_tracer *tracer;
- unsigned long irq;
int error;
+ /* Buffer overflow notification is not yet implemented. */
error = -EOPNOTSUPP;
- if (!ds_cfg.ctl[dsf_bts])
+ if (ovfl)
goto out;
- /* buffer overflow notification is not yet implemented */
- error = -EOPNOTSUPP;
- if (ovfl)
+ error = get_tracer(task);
+ if (error < 0)
goto out;
error = -ENOMEM;
tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
if (!tracer)
- goto out;
+ goto out_put_tracer;
tracer->ovfl = ovfl;
+ /* Do some more error checking and acquire a tracing context. */
error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_bts, task, base, size, th, flags);
+ ds_bts, task, cpu, base, size, th);
if (error < 0)
goto out_tracer;
-
- spin_lock_irqsave(&ds_lock, irq);
-
- error = -EPERM;
- if (!check_tracer(task))
- goto out_unlock;
- get_tracer(task);
+ /* Claim the bts part of the tracing context we acquired above. */
+ spin_lock_irq(&ds_lock);
error = -EPERM;
if (tracer->ds.context->bts_master)
- goto out_put_tracer;
+ goto out_unlock;
tracer->ds.context->bts_master = tracer;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
+ /*
+ * Now that we own the bts part of the context, let's complete the
+ * initialization for that part.
+ */
+ ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_install_ds_area(tracer->ds.context);
tracer->trace.read = bts_read;
tracer->trace.write = bts_write;
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ /* Start tracing. */
ds_resume_bts(tracer);
return tracer;
- out_put_tracer:
- put_tracer(task);
out_unlock:
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
ds_put_context(tracer->ds.context);
out_tracer:
kfree(tracer);
+ out_put_tracer:
+ put_tracer(task);
out:
return ERR_PTR(error);
}
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
struct pebs_tracer *tracer;
- unsigned long irq;
int error;
- /* buffer overflow notification is not yet implemented */
+ /* Buffer overflow notification is not yet implemented. */
error = -EOPNOTSUPP;
if (ovfl)
goto out;
+ error = get_tracer(task);
+ if (error < 0)
+ goto out;
+
error = -ENOMEM;
tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
if (!tracer)
- goto out;
+ goto out_put_tracer;
tracer->ovfl = ovfl;
+ /* Do some more error checking and acquire a tracing context. */
error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_pebs, task, base, size, th, flags);
+ ds_pebs, task, cpu, base, size, th);
if (error < 0)
goto out_tracer;
- spin_lock_irqsave(&ds_lock, irq);
-
- error = -EPERM;
- if (!check_tracer(task))
- goto out_unlock;
- get_tracer(task);
+ /* Claim the pebs part of the tracing context we acquired above. */
+ spin_lock_irq(&ds_lock);
error = -EPERM;
if (tracer->ds.context->pebs_master)
- goto out_put_tracer;
+ goto out_unlock;
tracer->ds.context->pebs_master = tracer;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
+ /*
+ * Now that we own the pebs part of the context, let's complete the
+ * initialization for that part.
+ */
+ ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+ ds_install_ds_area(tracer->ds.context);
+
+ /* Start tracing. */
ds_resume_pebs(tracer);
return tracer;
- out_put_tracer:
- put_tracer(task);
out_unlock:
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
ds_put_context(tracer->ds.context);
out_tracer:
kfree(tracer);
+ out_put_tracer:
+ put_tracer(task);
out:
return ERR_PTR(error);
}
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
{
- if (!tracer)
- return;
+ return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
- ds_suspend_bts(tracer);
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+
+ task = tracer->ds.context->task;
WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
tracer->ds.context->bts_master = NULL;
- put_tracer(tracer->ds.context->task);
+ /* Make sure tracing stopped and the tracer is not in use. */
+ if (task && (task != current))
+ wait_task_context_switch(task);
+
ds_put_context(tracer->ds.context);
+ put_tracer(task);
kfree(tracer);
}
+void ds_release_bts(struct bts_tracer *tracer)
+{
+ might_sleep();
+
+ if (!tracer)
+ return;
+
+ ds_suspend_bts(tracer);
+ ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long irq;
+ int error;
+
+ if (!tracer)
+ return 0;
+
+ task = tracer->ds.context->task;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task &&
+ (tracer->ds.context->cpu != smp_processor_id()))
+ goto out;
+
+ error = -EPERM;
+ if (task && (task != current))
+ goto out;
+
+ ds_suspend_bts_noirq(tracer);
+ ds_free_bts(tracer);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+ unsigned long debugctlmsr)
+{
+ task->thread.debugctlmsr = debugctlmsr;
+
+ get_cpu();
+ if (task == current)
+ update_debugctlmsr(debugctlmsr);
+ put_cpu();
+}
+
void ds_suspend_bts(struct bts_tracer *tracer)
{
struct task_struct *task;
+ unsigned long debugctlmsr;
+ int cpu;
if (!tracer)
return;
+ tracer->flags = 0;
+
task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
- if (!task || (task == current))
- update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+ WARN_ON(!task && irqs_disabled());
- if (task) {
- task->thread.debugctlmsr &= ~BTS_CONTROL;
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr_on_cpu(cpu));
+ debugctlmsr &= ~BTS_CONTROL;
- if (!task->thread.debugctlmsr)
- clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
- }
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr_on_cpu(cpu, debugctlmsr);
}
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
{
struct task_struct *task;
- unsigned long control;
+ unsigned long debugctlmsr, irq;
+ int cpu, error = 0;
if (!tracer)
- return;
+ return 0;
+
+ tracer->flags = 0;
task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task && (cpu != smp_processor_id()))
+ goto out;
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr());
+ debugctlmsr &= ~BTS_CONTROL;
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr(debugctlmsr);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+ unsigned long control;
control = ds_cfg.ctl[dsf_bts];
if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
if (!(tracer->trace.ds.flags & BTS_USER))
control |= ds_cfg.ctl[dsf_bts_user];
- if (task) {
- task->thread.debugctlmsr |= control;
- set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
- }
-
- if (!task || (task == current))
- update_debugctlmsr(get_debugctlmsr() | control);
+ return control;
}
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
{
+ struct task_struct *task;
+ unsigned long debugctlmsr;
+ int cpu;
+
if (!tracer)
return;
- ds_suspend_pebs(tracer);
+ tracer->flags = tracer->trace.ds.flags;
+
+ task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ WARN_ON(!task && irqs_disabled());
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr_on_cpu(cpu));
+ debugctlmsr |= ds_bts_control(tracer);
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long debugctlmsr, irq;
+ int cpu, error = 0;
+
+ if (!tracer)
+ return 0;
+
+ tracer->flags = tracer->trace.ds.flags;
+
+ task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task && (cpu != smp_processor_id()))
+ goto out;
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr());
+ debugctlmsr |= ds_bts_control(tracer);
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr(debugctlmsr);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+ struct task_struct *task;
+
+ task = tracer->ds.context->task;
WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
tracer->ds.context->pebs_master = NULL;
- put_tracer(tracer->ds.context->task);
ds_put_context(tracer->ds.context);
+ put_tracer(task);
kfree(tracer);
}
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+ might_sleep();
+
+ if (!tracer)
+ return;
+
+ ds_suspend_pebs(tracer);
+ ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long irq;
+ int error;
+
+ if (!tracer)
+ return 0;
+
+ task = tracer->ds.context->task;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task &&
+ (tracer->ds.context->cpu != smp_processor_id()))
+ goto out;
+
+ error = -EPERM;
+ if (task && (task != current))
+ goto out;
+
+ ds_suspend_pebs_noirq(tracer);
+ ds_free_pebs(tracer);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
void ds_suspend_pebs(struct pebs_tracer *tracer)
{
}
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+ return 0;
+}
+
void ds_resume_pebs(struct pebs_tracer *tracer)
{
}
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+ return 0;
+}
+
const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
{
if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
return NULL;
ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
- tracer->trace.reset_value =
- *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+ tracer->trace.counters = ds_cfg.nr_counter_reset;
+ memcpy(tracer->trace.counter_reset,
+ tracer->ds.context->ds +
+ (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+ ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
return &tracer->trace;
}
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
tracer->trace.ds.top = tracer->trace.ds.begin;
- ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+ ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
(unsigned long)tracer->trace.ds.top);
return 0;
}
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+ unsigned int counter, u64 value)
{
if (!tracer)
return -EINVAL;
- *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ if (ds_cfg.nr_counter_reset < counter)
+ return -EINVAL;
+
+ *(u64 *)(tracer->ds.context->ds +
+ (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+ (counter * PEBS_RESET_FIELD_SIZE)) = value;
return 0;
}
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
.ctl[dsf_bts] = (1 << 2) | (1 << 3),
.ctl[dsf_bts_kernel] = (1 << 5),
.ctl[dsf_bts_user] = (1 << 6),
-
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
-#ifdef __i386__
- .sizeof_rec[ds_pebs] = sizeof(long) * 10,
-#else
- .sizeof_rec[ds_pebs] = sizeof(long) * 18,
-#endif
+ .nr_counter_reset = 1,
};
static const struct ds_configuration ds_cfg_pentium_m = {
.name = "Pentium M",
.ctl[dsf_bts] = (1 << 6) | (1 << 7),
-
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
-#ifdef __i386__
- .sizeof_rec[ds_pebs] = sizeof(long) * 10,
-#else
- .sizeof_rec[ds_pebs] = sizeof(long) * 18,
-#endif
+ .nr_counter_reset = 1,
};
static const struct ds_configuration ds_cfg_core2_atom = {
.name = "Core 2/Atom",
.ctl[dsf_bts] = (1 << 6) | (1 << 7),
.ctl[dsf_bts_kernel] = (1 << 9),
.ctl[dsf_bts_user] = (1 << 10),
-
- .sizeof_field = 8,
- .sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 18,
+ .nr_counter_reset = 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+ .name = "Core i7",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+ .ctl[dsf_bts_kernel] = (1 << 9),
+ .ctl[dsf_bts_user] = (1 << 10),
+ .nr_counter_reset = 4,
};
static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+ struct cpuinfo_x86 *cpu)
{
+ unsigned long nr_pebs_fields = 0;
+
+ printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+ nr_pebs_fields = 10;
+#else
+ nr_pebs_fields = 18;
+#endif
+
+ /*
+ * Starting with version 2, architectural performance
+ * monitoring supports a format specifier.
+ */
+ if ((cpuid_eax(0xa) & 0xff) > 1) {
+ unsigned long perf_capabilities, format;
+
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+ format = (perf_capabilities >> 8) & 0xf;
+
+ switch (format) {
+ case 0:
+ nr_pebs_fields = 18;
+ break;
+ case 1:
+ nr_pebs_fields = 22;
+ break;
+ default:
+ printk(KERN_INFO
+ "[ds] unknown PEBS format: %lu\n", format);
+ nr_pebs_fields = 0;
+ break;
+ }
+ }
+
memset(&ds_cfg, 0, sizeof(ds_cfg));
ds_cfg = *cfg;
- printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+ ds_cfg.sizeof_ptr_field =
+ (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
+
+ ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
+ ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
- if (!cpu_has_bts) {
- ds_cfg.ctl[dsf_bts] = 0;
+ if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+ ds_cfg.sizeof_rec[ds_bts] = 0;
printk(KERN_INFO "[ds] bts not available\n");
}
- if (!cpu_has_pebs)
+ if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+ ds_cfg.sizeof_rec[ds_pebs] = 0;
printk(KERN_INFO "[ds] pebs not available\n");
+ }
+
+ printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+ 8 * ds_cfg.sizeof_ptr_field);
+ printk("bts/pebs record: %u/%u bytes\n",
+ ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
- WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+ WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
}
void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
{
+ /* Only configure the first cpu. Others are identical. */
+ if (ds_cfg.name)
+ return;
+
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
case 0x9:
case 0xd: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m);
+ ds_configure(&ds_cfg_pentium_m, c);
break;
case 0xf:
case 0x17: /* Core2 */
case 0x1c: /* Atom */
- ds_configure(&ds_cfg_core2_atom);
+ ds_configure(&ds_cfg_core2_atom, c);
+ break;
+ case 0x1a: /* Core i7 */
+ ds_configure(&ds_cfg_core_i7, c);
break;
- case 0x1a: /* i7 */
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst);
+ ds_configure(&ds_cfg_netburst, c);
break;
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
break;
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
}
+static inline void ds_take_timestamp(struct ds_context *context,
+ enum bts_qualifier qualifier,
+ struct task_struct *task)
+{
+ struct bts_tracer *tracer = context->bts_master;
+ struct bts_struct ts;
+
+ /* Prevent compilers from reading the tracer pointer twice. */
+ barrier();
+
+ if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+ return;
+
+ memset(&ts, 0, sizeof(ts));
+ ts.qualifier = qualifier;
+ ts.variant.event.clock = trace_clock_global();
+ ts.variant.event.pid = task->pid;
+
+ bts_write(tracer, &ts);
+}
+
/*
* Change the DS configuration from tracing prev to tracing next.
*/
void ds_switch_to(struct task_struct *prev, struct task_struct *next)
{
- struct ds_context *prev_ctx = prev->thread.ds_ctx;
- struct ds_context *next_ctx = next->thread.ds_ctx;
+ struct ds_context *prev_ctx = prev->thread.ds_ctx;
+ struct ds_context *next_ctx = next->thread.ds_ctx;
+ unsigned long debugctlmsr = next->thread.debugctlmsr;
+
+ /* Make sure all data is read before we start. */
+ barrier();
if (prev_ctx) {
update_debugctlmsr(0);
- if (prev_ctx->bts_master &&
- (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
- struct bts_struct ts = {
- .qualifier = bts_task_departs,
- .variant.timestamp.jiffies = jiffies_64,
- .variant.timestamp.pid = prev->pid
- };
- bts_write(prev_ctx->bts_master, &ts);
- }
+ ds_take_timestamp(prev_ctx, bts_task_departs, prev);
}
if (next_ctx) {
- if (next_ctx->bts_master &&
- (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
- struct bts_struct ts = {
- .qualifier = bts_task_arrives,
- .variant.timestamp.jiffies = jiffies_64,
- .variant.timestamp.pid = next->pid
- };
- bts_write(next_ctx->bts_master, &ts);
- }
+ ds_take_timestamp(next_ctx, bts_task_arrives, next);
wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
}
- update_debugctlmsr(next->thread.debugctlmsr);
+ update_debugctlmsr(debugctlmsr);
}
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+static __init int ds_selftest(void)
{
- clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
- tsk->thread.ds_ctx = NULL;
-}
+ if (ds_cfg.sizeof_rec[ds_bts]) {
+ int error;
-void ds_exit_thread(struct task_struct *tsk)
-{
+ error = ds_selftest_bts();
+ if (error) {
+ WARN(1, "[ds] selftest failed. disabling bts.\n");
+ ds_cfg.sizeof_rec[ds_bts] = 0;
+ }
+ }
+
+ if (ds_cfg.sizeof_rec[ds_pebs]) {
+ int error;
+
+ error = ds_selftest_pebs();
+ if (error) {
+ WARN(1, "[ds] selftest failed. disabling pebs.\n");
+ ds_cfg.sizeof_rec[ds_pebs] = 0;
+ }
+ }
+
+ return 0;
}
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 00000000000..6bc7c199ab9
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/ds.h>
+
+
+#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
+
+struct ds_selftest_bts_conf {
+ struct bts_tracer *tracer;
+ int error;
+ int (*suspend)(struct bts_tracer *);
+ int (*resume)(struct bts_tracer *);
+};
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+ int error = 0;
+
+ if (!trace) {
+ printk(KERN_CONT "failed to access trace...");
+ /* Bail out. Other tests are pointless. */
+ return -1;
+ }
+
+ if (!trace->read) {
+ printk(KERN_CONT "bts read not available...");
+ error = -1;
+ }
+
+ /* Do some sanity checks on the trace configuration. */
+ if (!trace->ds.n) {
+ printk(KERN_CONT "empty bts buffer...");
+ error = -1;
+ }
+ if (!trace->ds.size) {
+ printk(KERN_CONT "bad bts trace setup...");
+ error = -1;
+ }
+ if (trace->ds.end !=
+ (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+ printk(KERN_CONT "bad bts buffer setup...");
+ error = -1;
+ }
+ /*
+ * We allow top in [begin; end], since its not clear when the
+ * overflow adjustment happens: after the increment or before the
+ * write.
+ */
+ if ((trace->ds.top < trace->ds.begin) ||
+ (trace->ds.end < trace->ds.top)) {
+ printk(KERN_CONT "bts top out of bounds...");
+ error = -1;
+ }
+
+ return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+ const struct bts_trace *trace,
+ const void *from, const void *to)
+{
+ const unsigned char *at;
+
+ /*
+ * Check a few things which do not belong to this test.
+ * They should be covered by other tests.
+ */
+ if (!trace)
+ return -1;
+
+ if (!trace->read)
+ return -1;
+
+ if (to < from)
+ return -1;
+
+ if (from < trace->ds.begin)
+ return -1;
+
+ if (trace->ds.end < to)
+ return -1;
+
+ if (!trace->ds.size)
+ return -1;
+
+ /* Now to the test itself. */
+ for (at = from; (void *)at < to; at += trace->ds.size) {
+ struct bts_struct bts;
+ unsigned long index;
+ int error;
+
+ if (((void *)at - trace->ds.begin) % trace->ds.size) {
+ printk(KERN_CONT
+ "read from non-integer index...");
+ return -1;
+ }
+ index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+ memset(&bts, 0, sizeof(bts));
+ error = trace->read(tracer, at, &bts);
+ if (error < 0) {
+ printk(KERN_CONT
+ "error reading bts trace at [%lu] (0x%p)...",
+ index, at);
+ return error;
+ }
+
+ switch (bts.qualifier) {
+ case BTS_BRANCH:
+ break;
+ default:
+ printk(KERN_CONT
+ "unexpected bts entry %llu at [%lu] (0x%p)...",
+ bts.qualifier, index, at);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static void ds_selftest_bts_cpu(void *arg)
+{
+ struct ds_selftest_bts_conf *conf = arg;
+ const struct bts_trace *trace;
+ void *top;
+
+ if (IS_ERR(conf->tracer)) {
+ conf->error = PTR_ERR(conf->tracer);
+ conf->tracer = NULL;
+
+ printk(KERN_CONT
+ "initialization failed (err: %d)...", conf->error);
+ return;
+ }
+
+ /* We should meanwhile have enough trace. */
+ conf->error = conf->suspend(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ /* Let's see if we can access the trace. */
+ trace = ds_read_bts(conf->tracer);
+
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ /* If everything went well, we should have a few trace entries. */
+ if (trace->ds.top == trace->ds.begin) {
+ /*
+ * It is possible but highly unlikely that we got a
+ * buffer overflow and end up at exactly the same
+ * position we started from.
+ * Let's issue a warning, but continue.
+ */
+ printk(KERN_CONT "no trace/overflow...");
+ }
+
+ /* Let's try to read the trace we collected. */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.top);
+ if (conf->error < 0)
+ return;
+
+ /*
+ * Let's read the trace again.
+ * Since we suspended tracing, we should get the same result.
+ */
+ top = trace->ds.top;
+
+ trace = ds_read_bts(conf->tracer);
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ if (top != trace->ds.top) {
+ printk(KERN_CONT "suspend not working...");
+ conf->error = -1;
+ return;
+ }
+
+ /* Let's collect some more trace - see if resume is working. */
+ conf->error = conf->resume(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ conf->error = conf->suspend(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ trace = ds_read_bts(conf->tracer);
+
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ if (trace->ds.top == top) {
+ /*
+ * It is possible but highly unlikely that we got a
+ * buffer overflow and end up at exactly the same
+ * position we started from.
+ * Let's issue a warning and check the full trace.
+ */
+ printk(KERN_CONT
+ "no resume progress/overflow...");
+
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.end);
+ } else if (trace->ds.top < top) {
+ /*
+ * We had a buffer overflow - the entire buffer should
+ * contain trace records.
+ */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.end);
+ } else {
+ /*
+ * It is quite likely that the buffer did not overflow.
+ * Let's just check the delta trace.
+ */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace, top,
+ trace->ds.top);
+ }
+ if (conf->error < 0)
+ return;
+
+ conf->error = 0;
+}
+
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+ ds_suspend_bts(tracer);
+ return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+ ds_resume_bts(tracer);
+ return 0;
+}
+
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+ (void)ds_release_bts_noirq(tracer);
+}
+
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+ struct bts_tracer *tracer)
+{
+ int error = -EPERM;
+
+ /* Try to release the tracer on the wrong cpu. */
+ get_cpu();
+ if (cpu != smp_processor_id()) {
+ error = ds_release_bts_noirq(tracer);
+ if (error != -EPERM)
+ printk(KERN_CONT "release on wrong cpu...");
+ }
+ put_cpu();
+
+ return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+ struct bts_tracer *tracer;
+ int error;
+
+ /* Try to request cpu tracing while task tracing is active. */
+ tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+ (size_t)-1, BTS_KERNEL);
+ error = PTR_ERR(tracer);
+ if (!IS_ERR(tracer)) {
+ ds_release_bts(tracer);
+ error = 0;
+ }
+
+ if (error != -EPERM)
+ printk(KERN_CONT "cpu/task tracing overlap...");
+
+ return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+ struct bts_tracer *tracer;
+ int error;
+
+ /* Try to request cpu tracing while task tracing is active. */
+ tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+ (size_t)-1, BTS_KERNEL);
+ error = PTR_ERR(tracer);
+ if (!IS_ERR(tracer)) {
+ error = 0;
+ ds_release_bts(tracer);
+ }
+
+ if (error != -EPERM)
+ printk(KERN_CONT "task/cpu tracing overlap...");
+
+ return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+ struct ds_selftest_bts_conf conf;
+ unsigned char buffer[BUFFER_SIZE], *small_buffer;
+ unsigned long irq;
+ int cpu;
+
+ printk(KERN_INFO "[ds] bts selftest...");
+ conf.error = 0;
+
+ small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ conf.suspend = ds_suspend_bts_wrap;
+ conf.resume = ds_resume_bts_wrap;
+ conf.tracer =
+ ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_task(buffer);
+ ds_release_bts(conf.tracer);
+ if (conf.error < 0)
+ goto out;
+
+ conf.suspend = ds_suspend_bts_noirq;
+ conf.resume = ds_resume_bts_noirq;
+ conf.tracer =
+ ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+ if (conf.error >= 0) {
+ conf.error =
+ ds_selftest_bts_bad_release_noirq(cpu,
+ conf.tracer);
+ /* We must not release the tracer twice. */
+ if (conf.error < 0)
+ conf.tracer = NULL;
+ }
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_task(buffer);
+ smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+ conf.tracer, 1);
+ if (conf.error < 0)
+ goto out;
+ }
+
+ conf.suspend = ds_suspend_bts_wrap;
+ conf.resume = ds_resume_bts_wrap;
+ conf.tracer =
+ ds_request_bts_task(current, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+ ds_release_bts(conf.tracer);
+ if (conf.error < 0)
+ goto out;
+
+ conf.suspend = ds_suspend_bts_noirq;
+ conf.resume = ds_resume_bts_noirq;
+ conf.tracer =
+ ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ local_irq_save(irq);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+ ds_release_bts_noirq(conf.tracer);
+ local_irq_restore(irq);
+ if (conf.error < 0)
+ goto out;
+
+ conf.error = 0;
+ out:
+ put_online_cpus();
+ printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+ return conf.error;
+}
+
+int ds_selftest_pebs(void)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 00000000000..2ba8745c666
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 95ea5fa7d44..c8405718a4c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,6 +22,7 @@
#include "dumpstack.h"
int panic_on_unrecovered_nmi;
+int panic_on_io_nmi;
unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b869..81086c227ab 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl);
extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
/* The form of the top of the frame on the stack */
struct stack_frame {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index d593cd1f58d..bca5fba91c9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,6 +19,12 @@
#include "dumpstack.h"
+/* Just a stub for now */
+int x86_is_stack_id(int id, char *name)
+{
+ return 0;
+}
+
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d35db5993fd..54b0a327676 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -19,10 +19,8 @@
#include "dumpstack.h"
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
-{
- static char ids[][8] = {
+
+static char x86_stack_ids[][8] = {
[DEBUG_STACK - 1] = "#DB",
[NMI_STACK - 1] = "NMI",
[DOUBLEFAULT_STACK - 1] = "#DF",
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
#endif
};
+
+int x86_is_stack_id(int id, char *name)
+{
+ return x86_stack_ids[id - 1] == name;
+}
+
+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
+ unsigned *usedp, char **idp)
+{
unsigned k;
/*
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
if (*usedp & (1U << k))
break;
*usedp |= 1U << k;
- *idp = ids[k];
+ *idp = x86_stack_ids[k];
return (unsigned long *)end;
}
/*
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
do {
++j;
end -= EXCEPTION_STKSZ;
- ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
+ x86_stack_ids[j][4] = '1' +
+ (j - N_EXCEPTION_STACKS);
} while (stack < end - EXCEPTION_STKSZ);
if (*usedp & (1U << j))
break;
*usedp |= 1U << j;
- *idp = ids[j];
+ *idp = x86_stack_ids[j];
return (unsigned long *)end;
}
#endif
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 00628130292..5cb5725b2ba 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
*/
__init void e820_setup_gap(void)
{
- unsigned long gapstart, gapsize, round;
+ unsigned long gapstart, gapsize;
int found;
gapstart = 0x10000000;
@@ -627,22 +627,16 @@ __init void e820_setup_gap(void)
#ifdef CONFIG_X86_64
if (!found) {
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
- "address range\n"
- KERN_ERR "PCI: Unassigned devices with 32bit resource "
- "registers may break!\n");
+ printk(KERN_ERR
+ "PCI: Warning: Cannot find a gap in the 32bit address range\n"
+ "PCI: Unassigned devices with 32bit resource registers may break!\n");
}
#endif
/*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
+ * e820_reserve_resources_late protect stolen RAM already
*/
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
+ pci_mem_start = gapstart;
printk(KERN_INFO
"Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1365,25 @@ void __init e820_reserve_resources(void)
}
}
+/* How much should we pad RAM ending depending on where it is? */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+ unsigned long mb = pos >> 20;
+
+ /* To 64kB in the first megabyte */
+ if (!mb)
+ return 64*1024;
+
+ /* To 1MB in the first 16MB */
+ if (mb < 16)
+ return 1024*1024;
+
+ /* To 32MB for anything above that */
+ return 32*1024*1024;
+}
+
+#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
+
void __init e820_reserve_resources_late(void)
{
int i;
@@ -1382,6 +1395,26 @@ void __init e820_reserve_resources_late(void)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
+
+ /*
+ * Try to bump up RAM regions to reasonable boundaries to
+ * avoid stolen RAM:
+ */
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *entry = &e820.map[i];
+ u64 start, end;
+
+ if (entry->type != E820_RAM)
+ continue;
+ start = entry->addr + entry->size;
+ end = round_up(start, ram_alignment(start)) - 1;
+ if (end > MAX_RESOURCE_SIZE)
+ end = MAX_RESOURCE_SIZE;
+ if (start >= end)
+ continue;
+ reserve_region_with_split(&iomem_resource, start, end,
+ "RAM buffer");
+ }
}
char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953de..ebdb85cf268 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
+#endif
static void __init ati_bugs(int num, int slot, int func)
{
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1736acc4d7a..fe26ba3e345 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void)
unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
int e820_type;
- if (md->attribute & EFI_MEMORY_WB)
- e820_type = E820_RAM;
- else
+ switch (md->type) {
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
+ if (md->attribute & EFI_MEMORY_WB)
+ e820_type = E820_RAM;
+ else
+ e820_type = E820_RESERVED;
+ break;
+ case EFI_ACPI_RECLAIM_MEMORY:
+ e820_type = E820_ACPI;
+ break;
+ case EFI_ACPI_MEMORY_NVS:
+ e820_type = E820_NVS;
+ break;
+ case EFI_UNUSABLE_MEMORY:
+ e820_type = E820_UNUSABLE;
+ break;
+ default:
+ /*
+ * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
+ * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
+ * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
+ */
e820_type = E820_RESERVED;
+ break;
+ }
e820_add_region(start, size, e820_type);
}
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
@@ -329,7 +354,7 @@ void __init efi_init(void)
*/
c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
if (c16) {
- for (i = 0; i < sizeof(vendor) && *c16; ++i)
+ for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
vendor[i] = *c16++;
vendor[i] = '\0';
} else
@@ -487,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
&& end_pfn <= max_pfn_mapped))
va = __va(md->phys_addr);
else
- va = efi_ioremap(md->phys_addr, size);
+ va = efi_ioremap(md->phys_addr, size, md->type);
md->virt_addr = (u64) (unsigned long) va;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c5..ac0621a7ac3 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
early_runtime_code_mapping_set_exec(0);
}
-void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
+ u32 type)
{
unsigned long last_map_pfn;
+ if (type == EFI_MEMORY_MAPPED_IO)
+ return ioremap(phys_addr, size);
+
last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
return NULL;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add475c..c097e7d607c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -48,7 +48,6 @@
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page_types.h>
-#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
@@ -84,7 +83,7 @@
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
#define preempt_stop(clobbers)
-#define resume_kernel restore_nocheck
+#define resume_kernel restore_all
#endif
.macro TRACE_IRQS_IRET
@@ -372,7 +371,7 @@ END(ret_from_exception)
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_nocheck
+ jnz restore_all
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
@@ -540,6 +539,8 @@ syscall_exit:
jne syscall_exit_work
restore_all:
+ TRACE_IRQS_IRET
+restore_all_notrace:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -551,8 +552,6 @@ restore_all:
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
- TRACE_IRQS_IRET
-restore_nocheck_notrace:
RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
@@ -588,22 +587,34 @@ ldt_ss:
jne restore_nocheck
#endif
- /* If returning to userspace with 16bit stack,
- * try to fix the higher word of ESP, as the CPU
- * won't restore it.
- * This is an "official" bug of all the x86-compatible
- * CPUs, which we can try to work around to make
- * dosemu and wine happy. */
- movl PT_OLDESP(%esp), %eax
- movl %esp, %edx
- call patch_espfix_desc
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ PER_CPU(gdt_page, %ebx)
+ shr $16, %edx
+ mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
+ mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
pushl $__ESPFIX_SS
CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
+ push %eax /* new kernel esp */
CFI_ADJUST_CFA_OFFSET 4
+ /* Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the iret */
DISABLE_INTERRUPTS(CLBR_EAX)
- TRACE_IRQS_OFF
- lss (%esp), %esp
+ lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
CFI_ENDPROC
@@ -716,15 +727,24 @@ PTREGSCALL(vm86)
PTREGSCALL(vm86old)
.macro FIXUP_ESPFIX_STACK
- /* since we are on a wrong stack, we cant make it a C code :( */
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+ /* fixup the stack */
PER_CPU(gdt_page, %ebx)
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- addl %esp, %eax
+ mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
+ mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
CFI_ADJUST_CFA_OFFSET 4
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
- lss (%esp), %esp
+ lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
.macro UNWIND_ESPFIX_STACK
@@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller)
pushl %edx
movl 0xc(%esp), %edx
lea 0x4(%ebp), %eax
+ movl (%ebp), %ecx
subl $MCOUNT_INSN_SIZE, %edx
call prepare_ftrace_return
popl %edx
@@ -1168,6 +1189,7 @@ return_to_handler:
pushl %eax
pushl %ecx
pushl %edx
+ movl %ebp, %eax
call ftrace_return_to_handler
movl %eax, 0xc(%esp)
popl %edx
@@ -1329,7 +1351,7 @@ nmi_stack_correct:
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
- jmp restore_nocheck_notrace
+ jmp restore_all_notrace
CFI_ENDPROC
nmi_stack_fixup:
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e843..c251be74510 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller)
leaq 8(%rbp), %rdi
movq 0x38(%rsp), %rsi
+ movq (%rbp), %rdx
subq $MCOUNT_INSN_SIZE, %rsi
call prepare_ftrace_return
@@ -147,27 +148,15 @@ END(ftrace_graph_caller)
GLOBAL(return_to_handler)
subq $80, %rsp
+ /* Save the return values */
movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
- movq %r10, 56(%rsp)
- movq %r11, 64(%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
call ftrace_return_to_handler
movq %rax, 72(%rsp)
- movq 64(%rsp), %r11
- movq 56(%rsp), %r10
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
+ movq 8(%rsp), %rdx
movq (%rsp), %rax
addq $72, %rsp
retq
@@ -976,6 +965,8 @@ END(\sym)
#ifdef CONFIG_SMP
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt REBOOT_VECTOR \
+ reboot_interrupt smp_reboot_interrupt
#endif
#ifdef CONFIG_X86_UV
@@ -1007,10 +998,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
- threshold_interrupt mce_threshold_interrupt
+ threshold_interrupt smp_threshold_interrupt
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
+#ifdef CONFIG_X86_MCE
+apicinterrupt MCE_SELF_VECTOR \
+ mce_self_interrupt smp_mce_self_interrupt
+#endif
+
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
@@ -1025,6 +1021,11 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PENDING_VECTOR \
+ perf_pending_interrupt smp_perf_pending_interrupt
+#endif
+
/*
* Exception entry points.
*/
@@ -1379,10 +1380,15 @@ END(xen_failsafe_callback)
paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check do_machine_check
+paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
/*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index b79c5533c42..d94e1ea3b9f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void)
* Hook the return address and push it in the stack of return addrs
* in current thread info.
*/
-void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+ unsigned long frame_pointer)
{
unsigned long old;
int faulted;
@@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
return;
}
- if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) {
+ if (ftrace_push_return_trace(old, self_addr, &trace.depth,
+ frame_pointer) == -EBUSY) {
*parent = old;
return;
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0c..cc827ac9e8d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -13,7 +13,6 @@
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/pgtable_types.h>
-#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
@@ -262,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
* which will be freed later
*/
-#ifndef CONFIG_HOTPLUG_CPU
-.section .init.text,"ax",@progbits
-#endif
+__CPUINIT
#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
@@ -603,18 +600,11 @@ ignore_int:
#endif
iret
-.section .cpuinit.data,"wa"
+ __REFDATA
.align 4
ENTRY(initial_code)
.long i386_start_kernel
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
/*
* BSS section
*/
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 54b29bb24e7..fa54f78e2a0 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -12,7 +12,6 @@
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
-#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/page.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 81408b93f88..dedc2bddf7a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev)
{
if (request_irq(dev->irq, hpet_interrupt_handler,
- IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev))
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ dev->name, dev))
return -1;
disable_irq(dev->irq);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d..5cf36c053ac 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
+#include <linux/timex.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269bea..270ff83efc1 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-struct mm_struct init_mm = INIT_MM(init_mm);
/*
* Initial thread structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c..b0cdde6932f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,8 @@
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/idle.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
atomic_t irq_err_count;
@@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL;
*/
void ack_bad_irq(unsigned int irq)
{
- printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+ if (printk_ratelimit())
+ pr_err("unexpected IRQ trap at vector %02x\n", irq);
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Currently unexpected vectors happen only on SMP and APIC.
* We _must_ ack these because every local APIC has only N
@@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- if (cpu_has_apic)
- ack_APIC_irq();
-#endif
+ ack_APIC_irq();
}
#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "CNT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_printf(p, " Performance counter interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PND");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+ seq_printf(p, " Performance pending work\n");
#endif
if (generic_interrupt_extension) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
+#ifdef CONFIG_X86_NEW_MCE
+ seq_printf(p, "%*s: ", prec, "MCE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+ seq_printf(p, " Machine check exceptions\n");
+ seq_printf(p, "%*s: ", prec, "MCP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+ seq_printf(p, " Machine check polls\n");
+#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
+ sum += irq_stats(cpu)->apic_pending_irqs;
#endif
if (generic_interrupt_extension)
sum += irq_stats(cpu)->generic_irqs;
@@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#endif
#ifdef CONFIG_X86_MCE
sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
+# endif
#endif
+#ifdef CONFIG_X86_NEW_MCE
+ sum += per_cpu(mce_exception_count, cpu);
+ sum += per_cpu(mce_poll_count, cpu);
#endif
return sum;
}
@@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
irq = __get_cpu_var(vector_irq)[vector];
if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
- if (!disable_apic)
- ack_APIC_irq();
-#endif
+ ack_APIC_irq();
if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
}
irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f..92b7703d3d5 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
+#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/timex.h>
#include <linux/slab.h>
#include <linux/random.h>
+#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/timer.h>
+#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
@@ -22,7 +27,23 @@
#include <asm/i8259.h>
#include <asm/traps.h>
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+#ifdef CONFIG_X86_32
/*
* Note that on a 486, we don't want to do a SIGFPE on an irq13
* as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
.handler = math_error_irq,
.name = "fpu",
};
-
-void __init init_ISA_irqs(void)
-{
- int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_bsp_APIC();
#endif
- init_8259A(0);
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
/*
* IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
return 0;
}
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init init_ISA_irqs(void)
{
int i;
- /* Execute any quirks before the call gates are initialised: */
- x86_quirk_pre_intr_init();
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
/*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
+ * 16 old-style INTA-cycle interrupts:
*/
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* SYSCALL_VECTOR was reserved in trap_init. */
- if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
}
+}
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+static void __init smp_intr_init(void)
+{
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
@@ -160,16 +166,35 @@ void __init native_init_IRQ(void)
/* IPI for generic function call */
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
- /* IPI for single call function */
+ /* IPI for generic single function call */
alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
+ call_function_single_interrupt);
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+
+ /* IPI used for rebooting/stopping */
+ alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
#endif
+#endif /* CONFIG_SMP */
+}
+
+static void __init apic_intr_init(void)
+{
+ smp_intr_init();
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+ alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -179,16 +204,59 @@ void __init native_init_IRQ(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ /* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
#endif
+}
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
- /* thermal monitor LVT interrupt */
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+#ifdef CONFIG_X86_32
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
#endif
+ init_ISA_irqs();
+}
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* Execute any quirks before the call gates are initialised: */
+ x86_quirk_pre_intr_init();
+
+ apic_intr_init();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
+ if (!test_bit(i, used_vectors))
+ set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ }
if (!acpi_ioapic)
setup_irq(2, &irq2);
+#ifdef CONFIG_X86_32
/*
* Call quirks after call gates are initialised (usually add in
* the architecture specific gates):
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void)
setup_irq(FPU_IRQ, &fpu_irq);
irq_ctx_init(smp_processor_id());
+#endif
}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd4..00000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
- .handler = no_action,
- .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
- return 1;
- }
-
- return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
- int i;
-
- init_bsp_APIC();
- init_8259A(0);
-
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-}
-
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* generic IPI for platform specific use */
- alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-}
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- init_ISA_irqs();
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
- }
-
- apic_intr_init();
-
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919..8d82a77a3f3 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = p->thread.ip;
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b..c664d515f61 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <asm/timer.h>
#define MMU_QUEUE_SIZE 1024
@@ -195,11 +196,11 @@ static void kvm_leave_lazy_mmu(void)
struct kvm_para_state *state = kvm_para_state();
mmu_queue_flush(state);
- paravirt_leave_lazy(paravirt_get_lazy_mode());
+ paravirt_leave_lazy_mmu();
state->mode = paravirt_get_lazy_mode();
}
-static void paravirt_ops_setup(void)
+static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
pv_info.paravirt_enabled = 1;
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
}
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
}
void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a0..2a62d843f01 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
static struct irqaction mfgptirq = {
.handler = mfgpt_tick,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
.name = "mfgpt-timer"
};
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c..366baa17991 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
#include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
#include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
#define UCODE_CONTAINER_SECTION_HDR 8
#define UCODE_CONTAINER_HEADER_SIZE 12
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
static struct equiv_cpu_entry *equiv_cpu_table;
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
return 1;
}
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
{
- unsigned long flags;
u32 rev, dummy;
int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
BUG_ON(cpu_num != cpu);
if (mc_amd == NULL)
- return;
+ return 0;
- spin_lock_irqsave(&microcode_update_lock, flags);
wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* get patch id after patching */
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
/* check current patch id and patch's id for match */
if (rev != mc_amd->hdr.patch_id) {
printk(KERN_ERR "microcode: CPU%d: update failed "
"(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
- return;
+ return -1;
}
printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
cpu, rev);
uci->cpu_sig.rev = rev;
+
+ return 0;
}
static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
static void free_equiv_cpu_table(void)
{
- if (equiv_cpu_table) {
- vfree(equiv_cpu_table);
- equiv_cpu_table = NULL;
- }
+ vfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
}
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
int new_rev = uci->cpu_sig.rev;
unsigned int leftover;
unsigned long offset;
+ enum ucode_state state = UCODE_OK;
offset = install_equiv_cpu_table(ucode_ptr);
if (!offset) {
printk(KERN_ERR "microcode: failed to create "
"equivalent cpu table\n");
- return -EINVAL;
+ return UCODE_ERROR;
}
ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
mc_header = (struct microcode_header_amd *)mc;
if (get_matching_microcode(cpu, mc, new_rev)) {
- if (new_mc)
- vfree(new_mc);
+ vfree(new_mc);
new_rev = mc_header->patch_id;
new_mc = mc;
} else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
if (new_mc) {
if (!leftover) {
- if (uci->mc)
- vfree(uci->mc);
+ vfree(uci->mc);
uci->mc = new_mc;
pr_debug("microcode: CPU%d found a matching microcode "
"update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
- } else
+ } else {
vfree(new_mc);
- }
+ state = UCODE_ERROR;
+ }
+ } else
+ state = UCODE_NFOUND;
free_equiv_cpu_table();
- return (int)leftover;
+ return state;
}
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
const char *fw_name = "amd-ucode/microcode_amd.bin";
const struct firmware *firmware;
- int ret;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
+ enum ucode_state ret;
- ret = request_firmware(&firmware, fw_name, device);
- if (ret) {
+ if (request_firmware(&firmware, fw_name, device)) {
printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
- return ret;
+ return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
return ret;
}
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
{
printk(KERN_INFO "microcode: AMD microcode update via "
"/dev/cpu/microcode not supported\n");
- return -1;
+ return UCODE_ERROR;
}
static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d..9371448290a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
* Thanks to Stuart Swales for pointing out this bug.
*/
#include <linux/platform_device.h>
-#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <asm/microcode.h>
#include <asm/processor.h>
-#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
EXPORT_SYMBOL_GPL(ucode_cpu_info);
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+ struct cpu_signature *cpu_sig;
+ int err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+ struct cpu_info_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+ ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+ struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ int ret;
+
+ memset(uci, 0, sizeof(*uci));
+
+ ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+ if (!ret)
+ uci->valid = 1;
+
+ return ret;
+}
+
+struct apply_microcode_ctx {
+ int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+ struct apply_microcode_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+ struct apply_microcode_ctx ctx = { .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
static int do_microcode_update(const void __user *buf, size_t size)
{
- cpumask_t old;
int error = 0;
int cpu;
- old = current->cpus_allowed;
-
for_each_online_cpu(cpu) {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
if (!uci->valid)
continue;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- error = microcode_ops->request_microcode_user(cpu, buf, size);
- if (error < 0)
- goto out;
- if (!error)
- microcode_ops->apply_microcode(cpu);
+ ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+ if (ustate == UCODE_ERROR) {
+ error = -1;
+ break;
+ } else if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
}
-out:
- set_cpus_allowed_ptr(current, &old);
+
return error;
}
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
static ssize_t microcode_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
- ssize_t ret;
+ ssize_t ret = -EINVAL;
if ((len >> PAGE_SHIFT) > num_physpages) {
- printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
- num_physpages);
- return -EINVAL;
+ pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+ return ret;
}
get_online_cpus();
mutex_lock(&microcode_mutex);
- ret = do_microcode_update(buf, len);
- if (!ret)
+ if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
mutex_unlock(&microcode_mutex);
@@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
}
static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
};
static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .fops = &microcode_fops,
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .devnode = "cpu/microcode",
+ .fops = &microcode_fops,
};
static int __init microcode_dev_init(void)
@@ -182,9 +246,7 @@ static int __init microcode_dev_init(void)
error = misc_register(&microcode_dev);
if (error) {
- printk(KERN_ERR
- "microcode: can't misc_register on minor=%d\n",
- MICROCODE_MINOR);
+ pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
return error;
}
@@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int err = 0;
mutex_lock(&microcode_mutex);
if (uci->valid) {
- err = microcode_ops->request_microcode_fw(smp_processor_id(),
- &microcode_pdev->dev);
- if (!err)
- microcode_ops->apply_microcode(smp_processor_id());
+ enum ucode_state ustate;
+
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+ if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
+ else
+ if (ustate == UCODE_ERROR)
+ err = -EINVAL;
}
mutex_unlock(&microcode_mutex);
+
return err;
}
static ssize_t reload_store(struct sys_device *dev,
struct sysdev_attribute *attr,
- const char *buf, size_t sz)
+ const char *buf, size_t size)
{
- char *end;
- unsigned long val = simple_strtoul(buf, &end, 0);
- int err = 0;
+ unsigned long val;
int cpu = dev->id;
+ int ret = 0;
+ char *end;
+ val = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
+
if (val == 1) {
get_online_cpus();
if (cpu_online(cpu))
- err = work_on_cpu(cpu, reload_for_cpu, NULL);
+ ret = reload_for_cpu(cpu);
put_online_cpus();
}
- if (err)
- return err;
- return sz;
+
+ if (!ret)
+ ret = size;
+
+ return ret;
}
static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = {
};
static struct attribute_group mc_attr_group = {
- .attrs = mc_default_attrs,
- .name = "microcode",
+ .attrs = mc_default_attrs,
+ .name = "microcode",
};
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu)
uci->valid = 0;
}
-static void microcode_fini_cpu(int cpu)
-{
- mutex_lock(&microcode_mutex);
- __microcode_fini_cpu(cpu);
- mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- memset(uci, 0, sizeof(*uci));
- if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
- uci->valid = 1;
+ if (!uci->mc)
+ return UCODE_NFOUND;
+
+ pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+ apply_microcode_on_target(cpu);
+
+ return UCODE_OK;
}
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct cpu_signature nsig;
+ enum ucode_state ustate;
- pr_debug("microcode: CPU%d resumed\n", cpu);
+ if (collect_cpu_info(cpu))
+ return UCODE_ERROR;
- if (!uci->mc)
- return 1;
+ /* --dimm. Trigger a delayed update? */
+ if (system_state != SYSTEM_RUNNING)
+ return UCODE_NFOUND;
- /*
- * Let's verify that the 'cached' ucode does belong
- * to this cpu (a bit of paranoia):
- */
- if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
- __microcode_fini_cpu(cpu);
- printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
- cpu);
- return -1;
- }
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
- __microcode_fini_cpu(cpu);
- printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
- cpu);
- /* Should we look for a new ucode here? */
- return 1;
+ if (ustate == UCODE_OK) {
+ pr_debug("microcode: CPU%d updated upon init\n", cpu);
+ apply_microcode_on_target(cpu);
}
- return 0;
+ return ustate;
}
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
- int err = 0;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
- /*
- * Check if the system resume is in progress (uci->valid != NULL),
- * otherwise just request a firmware:
- */
- if (uci->valid) {
- err = microcode_resume_cpu(smp_processor_id());
- } else {
- collect_cpu_info(smp_processor_id());
- if (uci->valid && system_state == SYSTEM_RUNNING)
- err = microcode_ops->request_microcode_fw(
- smp_processor_id(),
- &microcode_pdev->dev);
- }
- if (!err)
- microcode_ops->apply_microcode(smp_processor_id());
- return err;
-}
+ if (uci->valid)
+ ustate = microcode_resume_cpu(cpu);
+ else
+ ustate = microcode_init_cpu(cpu);
-static int microcode_init_cpu(int cpu)
-{
- int err;
- mutex_lock(&microcode_mutex);
- err = work_on_cpu(cpu, microcode_update_cpu, NULL);
- mutex_unlock(&microcode_mutex);
-
- return err;
+ return ustate;
}
static int mc_sysdev_add(struct sys_device *sys_dev)
{
int err, cpu = sys_dev->id;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!cpu_online(cpu))
return 0;
pr_debug("microcode: CPU%d added\n", cpu);
- memset(uci, 0, sizeof(*uci));
err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
if (err)
return err;
- err = microcode_init_cpu(cpu);
+ if (microcode_init_cpu(cpu) == UCODE_ERROR)
+ err = -EINVAL;
return err;
}
@@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
static int mc_sysdev_resume(struct sys_device *dev)
{
int cpu = dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!cpu_online(cpu))
return 0;
- /* only CPU 0 will apply ucode here */
- microcode_update_cpu(NULL);
+ /*
+ * All non-bootup cpus are still disabled,
+ * so only CPU 0 will apply ucode here.
+ *
+ * Moreover, there can be no concurrent
+ * updates from any other places at this point.
+ */
+ WARN_ON(cpu != 0);
+
+ if (uci->valid && uci->mc)
+ microcode_ops->apply_microcode(cpu);
+
return 0;
}
static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
- .resume = mc_sysdev_resume,
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
};
static __cpuinit int
@@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- if (microcode_init_cpu(cpu))
- printk(KERN_ERR "microcode: failed to init CPU%d\n",
- cpu);
+ microcode_update_cpu(cpu);
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
pr_debug("microcode: CPU%d added\n", cpu);
if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- printk(KERN_ERR "microcode: Failed to create the sysfs "
- "group for CPU%d\n", cpu);
+ pr_err("microcode: Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +509,10 @@ static int __init microcode_init(void)
microcode_ops = init_amd_microcode();
if (!microcode_ops) {
- printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+ pr_err("microcode: no support for this CPU vendor\n");
return -ENODEV;
}
- error = microcode_dev_init();
- if (error)
- return error;
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
if (IS_ERR(microcode_pdev)) {
@@ -480,23 +521,31 @@ static int __init microcode_init(void)
}
get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+ mutex_unlock(&microcode_mutex);
put_online_cpus();
+
if (error) {
- microcode_dev_exit();
platform_device_unregister(microcode_pdev);
return error;
}
+ error = microcode_dev_init();
+ if (error)
+ return error;
+
register_hotcpu_notifier(&mc_cpu_notifier);
- printk(KERN_INFO
- "Microcode Update Driver: v" MICROCODE_VERSION
+ pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>,"
" Peter Oruba\n");
return 0;
}
+module_init(microcode_init);
static void __exit microcode_exit(void)
{
@@ -505,16 +554,17 @@ static void __exit microcode_exit(void)
unregister_hotcpu_notifier(&mc_cpu_notifier);
get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+ mutex_unlock(&microcode_mutex);
put_online_cpus();
platform_device_unregister(microcode_pdev);
microcode_ops = NULL;
- printk(KERN_INFO
- "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+ pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
-
-module_init(microcode_init);
module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1a..0d334ddd0a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
#include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned long flags;
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->pf = 1 << ((val[1] >> 18) & 7);
}
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
-
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* see notes above for revision 1.07. Apparent chip bug */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- csig->sig, csig->pf, csig->rev);
+ printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+ cpu_num, csig->sig, csig->pf, csig->rev);
return 0;
}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
return 0;
}
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
- unsigned long flags;
unsigned int val[2];
int cpu_num;
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
BUG_ON(cpu_num != cpu);
if (mc_intel == NULL)
- return;
-
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
+ return 0;
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
if (val[1] != mc_intel->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d update from revision "
- "0x%x to 0x%x failed\n",
- cpu_num, uci->cpu_sig.rev, val[1]);
- return;
+ printk(KERN_ERR "microcode: CPU%d update "
+ "to revision 0x%x failed\n",
+ cpu_num, mc_intel->hdr.rev);
+ return -1;
}
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %04x-%02x-%02x \n",
- cpu_num, uci->cpu_sig.rev, val[1],
+ printk(KERN_INFO "microcode: CPU%d updated to revision "
+ "0x%x, date = %04x-%02x-%02x \n",
+ cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
(mc_intel->hdr.date >> 16) & 0xff);
uci->cpu_sig.rev = val[1];
+
+ return 0;
}
-static int generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u8 *ucode_ptr = data, *new_mc = NULL, *mc;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
while (leftover) {
struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
leftover -= mc_size;
}
- if (!new_mc)
+ if (leftover) {
+ if (new_mc)
+ vfree(new_mc);
+ state = UCODE_ERROR;
goto out;
+ }
- if (leftover) {
- vfree(new_mc);
+ if (!new_mc) {
+ state = UCODE_NFOUND;
goto out;
}
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
pr_debug("microcode: CPU%d found a matching microcode update with"
" version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
-
- out:
- return (int)leftover;
+out:
+ return state;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
- int ret;
+ enum ucode_state ret;
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
- ret = request_firmware(&firmware, name, device);
- if (ret) {
+
+ if (request_firmware(&firmware, name, device)) {
pr_debug("microcode: data file %s load failed\n", name);
- return ret;
+ return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
return copy_from_user(to, from, n);
}
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
{
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
-
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
index c23880b90b5..89f386f044e 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
-/* Kernel module help for x86-64
+/* Kernel module help for x86.
Copyright (C) 2001 Rusty Russell.
- Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
#include <linux/bug.h>
+#include <linux/mm.h>
#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#if 0
+#define DEBUGP printk
+#else
#define DEBUGP(fmt...)
-
-#ifndef CONFIG_UML
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
-}
+#endif
void *module_alloc(unsigned long size)
{
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)
if (!area)
return NULL;
- return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+ return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
+ PAGE_KERNEL_EXEC);
+}
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+ vfree(module_region);
}
-#endif
/* We don't need anything special. */
int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
+#ifdef CONFIG_X86_32
+int apply_relocate(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf32_Sym *sym;
+ uint32_t *location;
+
+ DEBUGP("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+ + ELF32_R_SYM(rel[i].r_info);
+
+ switch (ELF32_R_TYPE(rel[i].r_info)) {
+ case R_386_32:
+ /* We add the value into the location given */
+ *location += sym->st_value;
+ break;
+ case R_386_PC32:
+ /* Add the value, subtract its postition */
+ *location += sym->st_value - (uint32_t)location;
+ break;
+ default:
+ printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ me->name, ELF32_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+}
+
+int apply_relocate_add(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
+ me->name);
+ return -ENOEXEC;
+}
+#else /*X86_64*/
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,
return -ENOSYS;
}
+#endif
+
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819050e..00000000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Kernel module help for i386.
- Copyright (C) 2001 Rusty Russell.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-#include <linux/moduleloader.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/bug.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt...)
-#endif
-
-void *module_alloc(unsigned long size)
-{
- if (size == 0)
- return NULL;
- return vmalloc_exec(size);
-}
-
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
-int apply_relocate(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- unsigned int i;
- Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
- Elf32_Sym *sym;
- uint32_t *location;
-
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
- for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
- /* This is where to make the change */
- location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
- + rel[i].r_offset;
- /* This is the symbol it is referring to. Note that all
- undefined symbols have been resolved. */
- sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
- + ELF32_R_SYM(rel[i].r_info);
-
- switch (ELF32_R_TYPE(rel[i].r_info)) {
- case R_386_32:
- /* We add the value into the location given */
- *location += sym->st_value;
- break;
- case R_386_PC32:
- /* Add the value, subtract its postition */
- *location += sym->st_value - (uint32_t)location;
- break;
- default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
- me->name, ELF32_R_TYPE(rel[i].r_info));
- return -ENOEXEC;
- }
- }
- return 0;
-}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
- me->name);
- return -ENOEXEC;
-}
-
-int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
- char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
- if (!strcmp(".altinstructions", secstrings + s->sh_name))
- alt = s;
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
- }
-
- if (alt) {
- /* patch .altinstructions */
- void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size);
- }
- if (locks && text) {
- void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
- alternatives_smp_module_add(me, me->name,
- lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
- }
-
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
-
- return module_bug_finalize(hdr, sechdrs, me);
-}
-
-void module_arch_cleanup(struct module *mod)
-{
- alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
-}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c1..651c93b2886 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/smp.h>
+#include <linux/pci.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
#endif /* CONFIG_X86_IO_APIC */
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
- int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- if (!mpc_new_phys) {
- pr_info("No spare slots, try to append...take your risk, "
- "new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- pr_info("No spare slots, try to append..., "
- "new mpc_length %x\n", count);
- else {
- pr_err("mpc_new_length %lx is too small\n",
- mpc_new_length);
- return -1;
- }
+ int ret = 0;
+
+ if (!mpc_new_phys || count <= mpc_new_length) {
+ WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+ return -1;
}
- return 0;
+ return ret;
}
static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
@@ -963,11 +957,14 @@ out:
return 0;
}
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
static int __init update_mptable_setup(char *str)
{
enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
return 0;
}
early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
static int __init parse_alloc_mptable_opt(char *p)
{
enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
alloc_mptable = 1;
if (!p)
return 0;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec62..98fd6cd4e3a 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
.notifier_call = msr_class_cpu_callback,
};
+static char *msr_nodename(struct device *dev)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+}
+
static int __init msr_init(void)
{
int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
err = PTR_ERR(msr_class);
goto out_chrdev;
}
+ msr_class->nodename = msr_nodename;
for_each_online_cpu(i) {
err = msr_device_create(i);
if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea33..70ec9b951d7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- __get_cpu_var(paravirt_lazy_mode) = mode;
+ percpu_write(paravirt_lazy_mode, mode);
}
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
- __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+ percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
void paravirt_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+ leave_lazy(PARAVIRT_LAZY_MMU);
}
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
{
+ BUG_ON(preemptible());
+
+ if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
enter_lazy(PARAVIRT_LAZY_CPU);
}
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+ BUG_ON(preemptible());
+
+ leave_lazy(PARAVIRT_LAZY_CPU);
+
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
- return __get_cpu_var(paravirt_lazy_mode);
+ if (in_interrupt())
+ return PARAVIRT_LAZY_NONE;
+
+ return percpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
preempt_disable();
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- WARN_ON(preempt_count() == 1);
arch_leave_lazy_mmu_mode();
arch_enter_lazy_mmu_mode();
}
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
preempt_enable();
}
-void arch_flush_lazy_cpu_mode(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
- WARN_ON(preempt_count() == 1);
- arch_leave_lazy_cpu_mode();
- arch_enter_lazy_cpu_mode();
- }
-
- preempt_enable();
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
.set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay,
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
- },
+ .start_context_switch = paravirt_nop,
+ .end_context_switch = paravirt_nop,
};
struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f..971a3bec47a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
- int expected, unsigned long start, unsigned long end)
-{
- unsigned long idx = start;
-
- BUG_ON(start >= end);
-
- while (idx < end) {
- if (!!test_bit(idx, bitmap) != expected)
- return idx;
- ++idx;
- }
-
- /* all bits have the expected value */
- return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
- int expected, unsigned long start, unsigned long end)
-{
- return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
static inline int translation_enabled(struct iommu_table *tbl)
{
/* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
{
unsigned long index;
unsigned long end;
- unsigned long badbit;
unsigned long flags;
index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
spin_lock_irqsave(&tbl->it_lock, flags);
- badbit = verify_bit_range(tbl->it_map, 0, index, end);
- if (badbit != ~0UL) {
- if (printk_ratelimit())
- printk(KERN_ERR "Calgary: entry already allocated at "
- "0x%lx tbl %p dma 0x%lx npages %u\n",
- badbit, tbl, start_addr, npages);
- }
-
iommu_area_reserve(tbl->it_map, index, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry;
- unsigned long badbit;
unsigned long badend;
unsigned long flags;
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
spin_lock_irqsave(&tbl->it_lock, flags);
- badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
- if (badbit != ~0UL) {
- if (printk_ratelimit())
- printk(KERN_ERR "Calgary: bit is off at 0x%lx "
- "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
- badbit, tbl, dma_addr, entry, npages);
- }
-
iommu_area_free(tbl->it_map, entry, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
iommu_detected = 1;
calgary_detected = 1;
printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
- "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
- debugging ? "enabled" : "disabled");
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+ specified_table_size);
/* swiotlb for devices that aren't behind the Calgary. */
if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 745579bc825..1a041bcf506 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -32,6 +32,8 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
+int iommu_pass_through;
+
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
@@ -210,6 +212,10 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "soft", 4))
swiotlb = 1;
#endif
+ if (!strncmp(p, "pt", 2)) {
+ iommu_pass_through = 1;
+ return 1;
+ }
gart_parse_options(p);
@@ -290,6 +296,8 @@ static int __init pci_iommu_init(void)
void pci_iommu_shutdown(void)
{
gart_iommu_shutdown();
+
+ amd_iommu_shutdown();
}
/* Must execute after PCI subsystem */
fs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035..d2e56b8f48e 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
}
#ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x) \
- do { \
- if (iommu_leak_tab) \
- iommu_leak_tab[x] = __builtin_return_address(0);\
- } while (0)
-
-#define CLEAR_LEAK(x) \
- do { \
- if (iommu_leak_tab) \
- iommu_leak_tab[x] = NULL; \
- } while (0)
-
/* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
static int leak_trace;
static int iommu_leak_pages = 20;
static void dump_leak(void)
{
- int i;
static int dump;
- if (dump || !iommu_leak_tab)
+ if (dump)
return;
dump = 1;
- show_stack(NULL, NULL);
- /* Very crude. dump some from the end of the table too */
- printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
- iommu_leak_pages);
- for (i = 0; i < iommu_leak_pages; i += 2) {
- printk(KERN_DEBUG "%lu: ", iommu_pages-i);
- printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
- 0);
- printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
- }
- printk(KERN_DEBUG "\n");
+ show_stack(NULL, NULL);
+ debug_dma_dump_mappings(NULL);
}
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
#endif
static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
- SET_LEAK(iommu_page + i);
phys_mem += PAGE_SIZE;
}
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
- CLEAR_LEAK(iommu_page + i);
}
free_iommu(iommu_page, npages);
}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
- SET_LEAK(iommu_page);
addr += PAGE_SIZE;
iommu_page++;
}
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
agp_gatt_table = gatt;
- enable_gart_translations();
-
error = sysdev_class_register(&gart_sysdev_class);
if (!error)
error = sysdev_register(&device_gart);
@@ -707,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
nommu:
/* Should not happen anymore */
printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
- KERN_WARNING "falling back to iommu=soft.\n");
+ "falling back to iommu=soft.\n");
return -1;
}
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
- iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
- get_order(iommu_pages*sizeof(void *)));
- if (!iommu_leak_tab)
+ int ret;
+
+ ret = dma_debug_resize_entries(iommu_pages);
+ if (ret)
printk(KERN_DEBUG
- "PCI-DMA: Cannot allocate leak trace area\n");
+ "PCI-DMA: Cannot trace all the entries\n");
}
#endif
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
* the pages as Not-Present:
*/
wbinvd();
+
+ /*
+ * Now all caches are flushed and we can safely enable
+ * GART hardware. Doing it early leaves the possibility
+ * of stale cache entries that can lead to GART PTE
+ * errors.
+ */
+ enable_gart_translations();
/*
* Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e26..6af96ee4420 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
return paddr;
}
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
{
return baddr;
}
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
+ if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ iommu_pass_through)
swiotlb = 1;
#endif
if (swiotlb_force)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e84..071166a4ba8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/random.h>
#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
+#include <asm/ds.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
tsk->thread.xstate = NULL;
}
+
+ WARN(tsk->thread.ds_ctx, "leaking DS context\n");
}
void free_thread_info(struct thread_info *ti)
@@ -58,7 +63,7 @@ void arch_task_cache_init(void)
task_xstate_cachep =
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
- SLAB_PANIC, NULL);
+ SLAB_PANIC | SLAB_NOTRACK, NULL);
}
/*
@@ -83,8 +88,6 @@ void exit_thread(void)
put_cpu();
kfree(bp);
}
-
- ds_exit_thread(current);
}
void flush_thread(void)
@@ -516,16 +519,12 @@ static void c1e_idle(void)
if (!cpumask_test_cpu(cpu, c1e_mask)) {
cpumask_set_cpu(cpu, c1e_mask);
/*
- * Force broadcast so ACPI can not interfere. Needs
- * to run with interrupts enabled as it uses
- * smp_function_call.
+ * Force broadcast so ACPI can not interfere.
*/
- local_irq_enable();
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
&cpu);
printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
cpu);
- local_irq_disable();
}
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
@@ -613,3 +612,16 @@ static int __init idle_setup(char *str)
}
early_param("idle", idle_setup);
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ unsigned long range_end = mm->brk + 0x02000000;
+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a..59f4524984a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <stdarg.h>
-
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
@@ -33,7 +31,6 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
-#include <linux/random.h>
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.io_bitmap_max = 0;
}
- ds_copy_thread(p, current);
+ clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+ p->thread.ds_ctx = NULL;
clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
-unsigned long arch_align_stack(unsigned long sp)
-{
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
- return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b..ebefb5407b9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <stdarg.h>
-
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
@@ -32,7 +30,6 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
-#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
goto out;
}
- ds_copy_thread(p, me);
+ clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+ p->thread.ds_ctx = NULL;
clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/*
* Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
return do_arch_prctl(current, code, addr);
}
-unsigned long arch_align_stack(unsigned long sp)
-{
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
- return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 23b7c8f017e..09ecbde91c1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
+#include <linux/workqueue.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
}
#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+ /* The branch trace handle. */
+ struct bts_tracer *tracer;
+
+ /* The buffer used to store the branch trace and its size. */
+ void *buffer;
+ unsigned int size;
+
+ /* The mm that paid for the above buffer. */
+ struct mm_struct *mm;
+
+ /* The task this context belongs to. */
+ struct task_struct *task;
+
+ /* The signal to send on a bts buffer overflow. */
+ unsigned int bts_ovfl_signal;
+
+ /* The work struct to destroy a context. */
+ struct work_struct work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+ void *buffer = NULL;
+ int err = -ENOMEM;
+
+ err = account_locked_memory(current->mm, current->signal->rlim, size);
+ if (err < 0)
+ return err;
+
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
+ goto out_refund;
+
+ context->buffer = buffer;
+ context->size = size;
+ context->mm = get_task_mm(current);
+
+ return 0;
+
+ out_refund:
+ refund_locked_memory(current->mm, size);
+ return err;
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+ if (!context->buffer)
+ return;
+
+ kfree(context->buffer);
+ context->buffer = NULL;
+
+ refund_locked_memory(context->mm, context->size);
+ context->size = 0;
+
+ mmput(context->mm);
+ context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+ struct bts_context *context;
+
+ context = container_of(w, struct bts_context, work);
+
+ ds_release_bts(context->tracer);
+ put_task_struct(context->task);
+ free_bts_buffer(context);
+ kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+ INIT_WORK(&context->work, free_bts_context_work);
+ schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+ struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (context) {
+ context->task = task;
+ task->bts = context;
+
+ get_task_struct(task);
+ }
+
+ return context;
+}
+
static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
+ struct bts_context *context;
const struct bts_trace *trace;
struct bts_struct bts;
const unsigned char *at;
int error;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
at = trace->ds.top - ((index + 1) * trace->ds.size);
if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
if (!trace->read)
return -EOPNOTSUPP;
- error = trace->read(child->bts, at, &bts);
+ error = trace->read(context->tracer, at, &bts);
if (error < 0)
return error;
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
+ struct bts_context *context;
const struct bts_trace *trace;
const unsigned char *at;
int error, drained = 0;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
if (!trace->read)
return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
for (at = trace->ds.begin; (void *)at < trace->ds.top;
out++, drained++, at += trace->ds.size) {
struct bts_struct bts;
- int error;
- error = trace->read(child->bts, at, &bts);
+ error = trace->read(context->tracer, at, &bts);
if (error < 0)
return error;
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- error = ds_reset_bts(child->bts);
+ error = ds_reset_bts(context->tracer);
if (error < 0)
return error;
return drained;
}
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
- child->bts_buffer = alloc_locked_buffer(size);
- if (!child->bts_buffer)
- return -ENOMEM;
-
- child->bts_size = size;
-
- return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
- free_locked_buffer(child->bts_buffer, child->bts_size);
- child->bts_buffer = NULL;
- child->bts_size = 0;
-}
-
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
+ struct bts_context *context;
struct ptrace_bts_config cfg;
unsigned int flags = 0;
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
return -EFAULT;
- if (child->bts) {
- ds_release_bts(child->bts);
- child->bts = NULL;
- }
+ context = child->bts;
+ if (!context)
+ context = alloc_bts_context(child);
+ if (!context)
+ return -ENOMEM;
if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
if (!cfg.signal)
return -EINVAL;
- child->thread.bts_ovfl_signal = cfg.signal;
return -EOPNOTSUPP;
+ context->bts_ovfl_signal = cfg.signal;
}
- if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
- (cfg.size != child->bts_size)) {
- int error;
+ ds_release_bts(context->tracer);
+ context->tracer = NULL;
- ptrace_bts_free_buffer(child);
+ if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+ int err;
- error = ptrace_bts_allocate_buffer(child, cfg.size);
- if (error < 0)
- return error;
+ free_bts_buffer(context);
+ if (!cfg.size)
+ return 0;
+
+ err = alloc_bts_buffer(context, cfg.size);
+ if (err < 0)
+ return err;
}
if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
if (cfg.flags & PTRACE_BTS_O_SCHED)
flags |= BTS_TIMESTAMPS;
- child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
- /* ovfl = */ NULL, /* th = */ (size_t)-1,
- flags);
- if (IS_ERR(child->bts)) {
- int error = PTR_ERR(child->bts);
-
- ptrace_bts_free_buffer(child);
- child->bts = NULL;
+ context->tracer =
+ ds_request_bts_task(child, context->buffer, context->size,
+ NULL, (size_t)-1, flags);
+ if (unlikely(IS_ERR(context->tracer))) {
+ int error = PTR_ERR(context->tracer);
+ free_bts_buffer(context);
+ context->tracer = NULL;
return error;
}
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
+ struct bts_context *context;
const struct bts_trace *trace;
struct ptrace_bts_config cfg;
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
if (cfg_size < sizeof(cfg))
return -EIO;
- trace = ds_read_bts(child->bts);
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
memset(&cfg, 0, sizeof(cfg));
- cfg.size = trace->ds.end - trace->ds.begin;
- cfg.signal = child->thread.bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
+ cfg.size = trace->ds.end - trace->ds.begin;
+ cfg.signal = context->bts_ovfl_signal;
+ cfg.bts_size = sizeof(struct bts_struct);
if (cfg.signal)
cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
static int ptrace_bts_clear(struct task_struct *child)
{
+ struct bts_context *context;
const struct bts_trace *trace;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- return ds_reset_bts(child->bts);
+ return ds_reset_bts(context->tracer);
}
static int ptrace_bts_size(struct task_struct *child)
{
+ struct bts_context *context;
const struct bts_trace *trace;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
return (trace->ds.top - trace->ds.begin) / trace->ds.size;
}
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
- tsk->bts = NULL;
- tsk->bts_buffer = NULL;
- tsk->bts_size = 0;
- tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
{
if (unlikely(child->bts)) {
- ds_release_bts(child->bts);
+ free_bts_context(child->bts);
child->bts = NULL;
-
- /* We cannot update total_vm and locked_vm since
- child's mm is already gone. But we can reclaim the
- memory. */
- kfree(child->bts_buffer);
- child->bts_buffer = NULL;
- child->bts_size = 0;
}
}
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
- /*
- * Ptrace_detach() races with ptrace_untrace() in case
- * the child dies and is reaped by another thread.
- *
- * We only do the memory accounting at this point and
- * leave the buffer deallocation and the bts tracer
- * release to ptrace_bts_untrace() which will be called
- * later on with tasklist_lock held.
- */
- release_locked_buffer(child->bts_buffer, child->bts_size);
-}
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
#endif /* CONFIG_X86_PTRACE_BTS */
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
- ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
- ptrace_bts_untrace(child);
-}
-
/*
* Called by kernel/ptrace.c when detaching..
*
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
- ptrace_bts_detach(child);
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c..03801f2f761 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
+#elif defined(__x86_64__)
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f0..af71d06624b 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
break;
}
}
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+ struct pci_dev *nb_ht;
+ unsigned int devfn;
+ u32 val;
+
+ devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+ nb_ht = pci_get_slot(dev->bus, devfn);
+ if (!nb_ht)
+ return;
+
+ pci_read_config_dword(nb_ht, 0x60, &val);
+ set_dev_node(&dev->dev, val & 7);
+ pci_dev_put(dev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+ quirk_amd_nb_node);
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a..a06e8d10184 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/efi.h>
+#include <linux/dmi.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -17,7 +18,6 @@
#include <asm/cpu.h>
#ifdef CONFIG_X86_32
-# include <linux/dmi.h>
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
#else
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 360",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -240,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
},
},
+ { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
+ .callback = set_bios_reboot,
+ .ident = "CompuLab SBC-FITPC2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
+ },
+ },
{ }
};
@@ -387,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart);
#endif /* CONFIG_X86_32 */
+/*
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_CF9) {
+ reboot_type = BOOT_CF9;
+ printk(KERN_INFO "%s series board detected. "
+ "Selecting PCI-method for reboots.\n", d->ident);
+ }
+ return 0;
+}
+
+static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
+ { /* Handle problems with rebooting on Apple MacBook5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBookPro5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBookPro5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+ },
+ },
+ { }
+};
+
+static int __init pci_reboot_init(void)
+{
+ dmi_check_system(pci_reboot_dmi_table);
+ return 0;
+}
+core_initcall(pci_reboot_init);
+
static inline void kb_wait(void)
{
int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf6..63f32d220ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
#define ARCH_SETUP
#endif
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
RESERVE_BRK(dmi_alloc, 65536);
unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
unsigned long mmu_cr4_features = X86_CR4_PAE;
#endif
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
/*
* Setup options
@@ -281,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
@@ -293,15 +315,13 @@ static void __init reserve_brk(void)
#ifdef CONFIG_BLK_DEV_INITRD
-#ifdef CONFIG_X86_32
-
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
@@ -357,14 +377,13 @@ static void __init relocate_initrd(void)
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
-#endif
static void __init reserve_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = ramdisk_image + ramdisk_size;
- u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -394,14 +413,8 @@ static void __init reserve_initrd(void)
return;
}
-#ifdef CONFIG_X86_32
relocate_initrd();
-#else
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
- ramdisk_end, end_of_lowmem);
- initrd_start = 0;
-#endif
+
free_early(ramdisk_image, ramdisk_end);
}
#else
@@ -659,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
},
},
+ {
+ /*
+ * AMI BIOS with low memory corruption was found on Intel DG45ID board.
+ * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
+ * match only DMI_BOARD_NAME and see if there is more bad products
+ * with this vendor.
+ */
+ .callback = dmi_low_memory_corruption,
+ .ident = "AMI BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
+ },
+ },
#endif
{}
};
@@ -706,6 +732,12 @@ void __init setup_arch(char **cmdline_p)
#endif
saved_video_mode = boot_params.hdr.vid_mode;
bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,14 +886,20 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
setup_bios_corruption_check();
#endif
+ printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+
reserve_brk();
+ init_gbpages();
+
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
@@ -997,24 +1035,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- * Perform any necessary interrupt initialisation prior to setting up
- * the "ordinary" interrupt call gates. For legacy reasons, the ISA
- * interrupts should be initialised here if the machine emulates a PC
- * in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
- if (x86_quirks->arch_pre_intr_init) {
- if (x86_quirks->arch_pre_intr_init())
- return;
- }
- init_ISA_irqs();
-}
-
-/**
* x86_quirk_intr_init - post gate setup interrupt initialisation
*
* Description:
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13be36b..07d81916f21 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
}
/*
- * Remap allocator
+ * Large page remap allocator
*
* This allocator uses PMD page as unit. A PMD page is allocated for
* each cpu and each is remapped into vmalloc area using PMD mapping.
@@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
* better than only using 4k mappings while still being NUMA friendly.
*/
#ifdef CONFIG_NEED_MULTIPLE_NODES
-static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
+struct pcpul_ent {
+ unsigned int cpu;
+ void *ptr;
+};
+
+static size_t pcpul_size;
+static struct pcpul_ent *pcpul_map;
+static struct vm_struct pcpul_vm;
-static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
- if (off >= pcpur_size)
+ if (off >= pcpul_size)
return NULL;
- return virt_to_page(pcpur_ptrs[cpu] + off);
+ return virt_to_page(pcpul_map[cpu].ptr + off);
}
-static ssize_t __init setup_pcpu_remap(size_t static_size)
+static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
- static struct vm_struct vm;
- size_t ptrs_size, dyn_size;
+ size_t map_size, dyn_size;
unsigned int cpu;
+ int i, j;
ssize_t ret;
- /*
- * If large page isn't supported, there's no benefit in doing
- * this. Also, on non-NUMA, embedding is better.
- *
- * NOTE: disabled for now.
- */
- if (true || !cpu_has_pse || !pcpu_need_numa())
+ if (!chosen) {
+ size_t vm_size = VMALLOC_END - VMALLOC_START;
+ size_t tot_size = nr_cpu_ids * PMD_SIZE;
+
+ /* on non-NUMA, embedding is better */
+ if (!pcpu_need_numa())
+ return -EINVAL;
+
+ /* don't consume more than 20% of vmalloc area */
+ if (tot_size > vm_size / 5) {
+ pr_info("PERCPU: too large chunk size %zuMB for "
+ "large page remap\n", tot_size >> 20);
+ return -EINVAL;
+ }
+ }
+
+ /* need PSE */
+ if (!cpu_has_pse) {
+ pr_warning("PERCPU: lpage allocator requires PSE\n");
return -EINVAL;
+ }
/*
* Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary.
*/
- pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+ pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE);
- if (pcpur_size > PMD_SIZE) {
+ if (pcpul_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n");
return -EINVAL;
}
- dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
+ dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
/* allocate pointer array and alloc large pages */
- ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
- pcpur_ptrs = alloc_bootmem(ptrs_size);
+ map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
+ pcpul_map = alloc_bootmem(map_size);
for_each_possible_cpu(cpu) {
- pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
- if (!pcpur_ptrs[cpu])
+ pcpul_map[cpu].cpu = cpu;
+ pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
+ PMD_SIZE);
+ if (!pcpul_map[cpu].ptr) {
+ pr_warning("PERCPU: failed to allocate large page "
+ "for cpu%u\n", cpu);
goto enomem;
+ }
/*
- * Only use pcpur_size bytes and give back the rest.
+ * Only use pcpul_size bytes and give back the rest.
*
* Ingo: The 2MB up-rounding bootmem is needed to make
* sure the partial 2MB page is still fully RAM - it's
* not well-specified to have a PAT-incompatible area
* (unmapped RAM, device memory, etc.) in that hole.
*/
- free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
- PMD_SIZE - pcpur_size);
+ free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
+ PMD_SIZE - pcpul_size);
- memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+ memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
}
/* allocate address and map */
- vm.flags = VM_ALLOC;
- vm.size = num_possible_cpus() * PMD_SIZE;
- vm_area_register_early(&vm, PMD_SIZE);
+ pcpul_vm.flags = VM_ALLOC;
+ pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
+ vm_area_register_early(&pcpul_vm, PMD_SIZE);
for_each_possible_cpu(cpu) {
- pmd_t *pmd;
+ pmd_t *pmd, pmd_v;
- pmd = populate_extra_pmd((unsigned long)vm.addr
- + cpu * PMD_SIZE);
- set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
- PAGE_KERNEL_LARGE));
+ pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
+ cpu * PMD_SIZE);
+ pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, pmd_v);
}
/* we're ready, commit */
pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", vm.addr, static_size);
+ "%zu bytes\n", pcpul_vm.addr, static_size);
- ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+ ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, vm.addr, NULL);
- goto out_free_ar;
+ PMD_SIZE, pcpul_vm.addr, NULL);
+
+ /* sort pcpul_map array for pcpu_lpage_remapped() */
+ for (i = 0; i < nr_cpu_ids - 1; i++)
+ for (j = i + 1; j < nr_cpu_ids; j++)
+ if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
+ struct pcpul_ent tmp = pcpul_map[i];
+ pcpul_map[i] = pcpul_map[j];
+ pcpul_map[j] = tmp;
+ }
+
+ return ret;
enomem:
for_each_possible_cpu(cpu)
- if (pcpur_ptrs[cpu])
- free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
- ret = -ENOMEM;
-out_free_ar:
- free_bootmem(__pa(pcpur_ptrs), ptrs_size);
- return ret;
+ if (pcpul_map[cpu].ptr)
+ free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
+ free_bootmem(__pa(pcpul_map), map_size);
+ return -ENOMEM;
+}
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area. This is
+ * used by pageattr to detect VM aliases and break up the pcpu PMD
+ * mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used PMD
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+ void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
+ unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
+ int left = 0, right = nr_cpu_ids - 1;
+ int pos;
+
+ /* pcpul in use at all? */
+ if (!pcpul_map)
+ return NULL;
+
+ /* okay, perform binary search */
+ while (left <= right) {
+ pos = (left + right) / 2;
+
+ if (pcpul_map[pos].ptr < pmd_addr)
+ left = pos + 1;
+ else if (pcpul_map[pos].ptr > pmd_addr)
+ right = pos - 1;
+ else {
+ /* it shouldn't be in the area for the first chunk */
+ WARN_ON(offset < pcpul_size);
+
+ return pcpul_vm.addr +
+ pcpul_map[pos].cpu * PMD_SIZE + offset;
+ }
+ }
+
+ return NULL;
}
#else
-static ssize_t __init setup_pcpu_remap(size_t static_size)
+static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
return -EINVAL;
}
@@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
* mapping so that it can use PMD mapping without additional TLB
* pressure.
*/
-static ssize_t __init setup_pcpu_embed(size_t static_size)
+static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
{
size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
@@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
* this. Also, embedding allocation doesn't play well with
* NUMA.
*/
- if (!cpu_has_pse || pcpu_need_numa())
+ if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
return -EINVAL;
return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
@@ -297,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
pcpu4k_nr_static_pages = PFN_UP(static_size);
/* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+ pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
* sizeof(pcpu4k_pages[0]));
pcpu4k_pages = alloc_bootmem(pages_size);
@@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
void *ptr;
ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
- if (!ptr)
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate "
+ "4k page for cpu%u\n", cpu);
goto enomem;
+ }
memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
pcpu4k_pages[j++] = virt_to_page(ptr);
@@ -333,6 +416,16 @@ out_free_ar:
return ret;
}
+/* for explicit first chunk allocator selection */
+static char pcpu_chosen_alloc[16] __initdata;
+
+static int __init percpu_alloc_setup(char *str)
+{
+ strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
+ return 0;
+}
+early_param("percpu_alloc", percpu_alloc_setup);
+
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
@@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu)
#endif
}
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
void __init setup_per_cpu_areas(void)
{
size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void)
* of large page mappings. Please read comments on top of
* each allocator for details.
*/
- ret = setup_pcpu_remap(static_size);
- if (ret < 0)
- ret = setup_pcpu_embed(static_size);
+ ret = -EINVAL;
+ if (strlen(pcpu_chosen_alloc)) {
+ if (strcmp(pcpu_chosen_alloc, "4k")) {
+ if (!strcmp(pcpu_chosen_alloc, "lpage"))
+ ret = setup_pcpu_lpage(static_size, true);
+ else if (!strcmp(pcpu_chosen_alloc, "embed"))
+ ret = setup_pcpu_embed(static_size, true);
+ else
+ pr_warning("PERCPU: unknown allocator %s "
+ "specified\n", pcpu_chosen_alloc);
+ if (ret < 0)
+ pr_warning("PERCPU: %s allocator failed (%zd), "
+ "falling back to 4k\n",
+ pcpu_chosen_alloc, ret);
+ }
+ } else {
+ ret = setup_pcpu_lpage(static_size, false);
+ if (ret < 0)
+ ret = setup_pcpu_embed(static_size, false);
+ }
if (ret < 0)
ret = setup_pcpu_4k(static_size);
if (ret < 0)
@@ -425,6 +530,14 @@ void __init setup_per_cpu_areas(void)
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+ /*
+ * make sure boot cpu node_number is right, when boot cpu is on the
+ * node that doesn't have mem installed
+ */
+ per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e..4c578751e94 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
-
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
@@ -25,11 +24,11 @@
#include <asm/ucontext.h>
#include <asm/i387.h>
#include <asm/vdso.h>
+#include <asm/mce.h>
#ifdef CONFIG_X86_64
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
-#include <asm/mce.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -857,10 +856,10 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#ifdef CONFIG_X86_NEW_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_user();
+ mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
/* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8cca..ec1de97600e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
* this function calls the 'stop' function on all other CPUs in the system.
*/
+asmlinkage void smp_reboot_interrupt(void)
+{
+ ack_APIC_irq();
+ irq_enter();
+ stop_this_cpu(NULL);
+ irq_exit();
+}
+
static void native_smp_send_stop(void)
{
unsigned long flags;
+ unsigned long wait;
if (reboot_force)
return;
- smp_call_function(stop_this_cpu, NULL, 0);
+ /*
+ * Use an own vector here because smp_call_function
+ * does lots of things not suitable in a panic situation.
+ * On most systems we could also use an NMI here,
+ * but there are a few systems around where NMI
+ * is problematic so stay with an non NMI for now
+ * (this implies we cannot stop CPUs spinning with irq off
+ * currently)
+ */
+ if (num_online_cpus() > 1) {
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
+
+ /* Don't wait longer than a second */
+ wait = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && wait--)
+ udelay(1);
+ }
+
local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ /*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
}
void smp_call_function_interrupt(struct pt_regs *regs)
@@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
}
struct smp_ops smp_ops = {
- .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
- .smp_prepare_cpus = native_smp_prepare_cpus,
- .smp_cpus_done = native_smp_cpus_done,
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
- .smp_send_reschedule = native_smp_send_reschedule,
+ .smp_send_stop = native_smp_send_stop,
+ .smp_send_reschedule = native_smp_send_reschedule,
- .cpu_up = native_cpu_up,
- .cpu_die = native_cpu_die,
- .cpu_disable = native_cpu_disable,
- .play_dead = native_play_dead,
+ .cpu_up = native_cpu_up,
+ .cpu_die = native_cpu_die,
+ .cpu_disable = native_cpu_disable,
+ .play_dead = native_play_dead,
- .send_call_func_ipi = native_send_call_func_ipi,
+ .send_call_func_ipi = native_send_call_func_ipi,
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef917d..2fecda69ee6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-int __devinit
+int __cpuinit
wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-int __devinit
+static int __cpuinit
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
+ if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+ }
return boot_error;
}
@@ -871,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
err = do_boot_cpu(apicid, cpu);
- zap_low_mappings();
+ zap_low_mappings(false);
low_mappings = 0;
#else
err = do_boot_cpu(apicid, cpu);
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
!cpu_has_apic) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- printk(KERN_ERR "... forcing use of dummy APIC emulation."
+ if (!disable_apic) {
+ pr_err("BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ pr_err("... forcing use of dummy APIC emulation."
"(tell your hw vendor)\n");
+ }
smpboot_clear_io_apic();
arch_disable_smp_support();
return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d..c3eb207181f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
static int save_stack_stack(void *data, char *name)
{
- return -1;
+ return 0;
}
static void save_stack_address(void *data, unsigned long addr, int reliable)
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
}
EXPORT_SYMBOL_GPL(save_stack_trace);
+void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+{
+ dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b49..d51321ddafd 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
.long sys_inotify_init1
.long sys_preadv
.long sys_pwritev
+ .long sys_rt_tgsigqueueinfo /* 335 */
+ .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8c7b03b0cfc..77b9689f8ed 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -711,25 +711,31 @@ uv_activation_descriptor_init(int node, int pnode)
unsigned long pa;
unsigned long m;
unsigned long n;
- unsigned long mmr_image;
struct bau_desc *adp;
struct bau_desc *ad2;
- adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+ /*
+ * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+ * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+ */
+ adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+ UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
BUG_ON(!adp);
pa = uv_gpa(adp); /* need the real nasid*/
n = pa >> uv_nshift;
m = pa & uv_mmask;
- mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
- if (mmr_image) {
- uv_write_global_mmr64(pnode, (unsigned long)
- UVH_LB_BAU_SB_DESCRIPTOR_BASE,
- (n << UV_DESC_BASE_PNODE_SHIFT | m));
- }
+ uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+ (n << UV_DESC_BASE_PNODE_SHIFT | m));
- for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+ /*
+ * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+ * cpu even though we only use the first one; one descriptor can
+ * describe a broadcast to 256 nodes.
+ */
+ for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+ i++, ad2++) {
memset(ad2, 0, sizeof(struct bau_desc));
ad2->header.sw_ack_flag = 1;
/*
@@ -738,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode)
* note that base_dest_nodeid is actually a nasid.
*/
ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+ ad2->header.dest_subnodeid = 0x10; /* the LB */
ad2->header.command = UV_NET_ENDPOINT_INTD;
ad2->header.int_both = 1;
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff..5204332f475 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
#include <linux/edac.h>
#endif
+#include <asm/kmemcheck.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
@@ -53,6 +54,7 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/mce.h>
#include <asm/mach_traps.h>
@@ -64,8 +66,6 @@
#include <asm/setup.h>
#include <asm/traps.h>
-#include "cpu/mcheck/mce.h"
-
asmlinkage int system_call(void);
/* Do we ignore FPU interrupts ? */
@@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
+ if (panic_on_io_nmi)
+ panic("NMI IOCK error: Not continuing");
+
/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & 0xf) | 8;
outb(reason, 0x61);
@@ -534,6 +537,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
get_debugreg(condition, 6);
+ /* Catch kmemcheck conditions first of all! */
+ if (condition & DR_STEP && kmemcheck_trap(regs))
+ return;
+
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
@@ -798,15 +805,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
return new_kesp;
}
-#else
+#endif
+
asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
{
}
-#endif
/*
* 'math_state_restore()' saves the current math information in the
@@ -839,9 +846,6 @@ asmlinkage void math_state_restore(void)
}
clts(); /* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
- restore_fpu(tsk);
-#else
/*
* Paranoid restore. send a SIGSEGV if we fail to restore the state.
*/
@@ -850,7 +854,7 @@ asmlinkage void math_state_restore(void)
force_sig(SIGSEGV, tsk);
return;
}
-#endif
+
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
}
@@ -945,8 +949,13 @@ void __init trap_init(void)
#endif
set_intr_gate(19, &simd_coprocessor_error);
+ /* Reserve all the builtin and the syscall vector: */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, used_vectors);
+
#ifdef CONFIG_IA32_EMULATION
set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
@@ -963,17 +972,9 @@ void __init trap_init(void)
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
+
/*
* Should be a barrier for any external CPU state:
*/
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc43..71f4368b357 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
+#include <linux/timex.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@@ -274,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
* use the TSC value at the transitions to calculate a pretty
* good value for the TSC frequencty.
*/
+static inline int pit_verify_msb(unsigned char val)
+{
+ /* Ignore LSB */
+ inb(0x42);
+ return inb(0x42) == val;
+}
+
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
int count;
u64 tsc = 0;
for (count = 0; count < 50000; count++) {
- /* Ignore LSB */
- inb(0x42);
- if (inb(0x42) != val)
+ if (!pit_verify_msb(val))
break;
tsc = get_cycles();
}
@@ -335,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
* to do that is to just read back the 16-bit counter
* once from the PIT.
*/
- inb(0x42);
- inb(0x42);
+ pit_verify_msb(0);
if (pit_expect_msb(0xff, &tsc, &d1)) {
for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -347,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
* Iterate until the error is less than 500 ppm
*/
delta -= tsc;
- if (d1+d2 < delta >> 11)
- goto success;
+ if (d1+d2 >= delta >> 11)
+ continue;
+
+ /*
+ * Check the PIT one more time to verify that
+ * all TSC reads were stable wrt the PIT.
+ *
+ * This also guarantees serialization of the
+ * last cycle read ('d2') in pit_expect_msb.
+ */
+ if (!pit_verify_msb(0xfe - i))
+ break;
+ goto success;
}
}
printk("Fast TSC calibration failed\n");
@@ -384,13 +400,13 @@ unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+ unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
int hpet = is_hpet_enabled(), i, loopmin;
- tsc_khz = get_hypervisor_tsc_freq();
- if (tsc_khz) {
+ hv_tsc_khz = get_hypervisor_tsc_freq();
+ if (hv_tsc_khz) {
printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
- return tsc_khz;
+ return hv_tsc_khz;
}
local_irq_save(flags);
@@ -589,22 +605,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
*/
DEFINE_PER_CPU(unsigned long, cyc2ns);
+DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
- unsigned long long tsc_now, ns_now;
+ unsigned long long tsc_now, ns_now, *offset;
unsigned long flags, *scale;
local_irq_save(flags);
sched_clock_idle_sleep_event();
scale = &per_cpu(cyc2ns, cpu);
+ offset = &per_cpu(cyc2ns_offset, cpu);
rdtscll(tsc_now);
ns_now = __cycles_2_ns(tsc_now);
- if (cpu_khz)
+ if (cpu_khz) {
*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+ *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+ }
sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
@@ -631,17 +651,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
- unsigned long *lpj, dummy;
+ unsigned long *lpj;
if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
return 0;
- lpj = &dummy;
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ lpj = &boot_cpu_data.loops_per_jiffy;
#ifdef CONFIG_SMP
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#else
- lpj = &boot_cpu_data.loops_per_jiffy;
#endif
if (!ref_freq) {
@@ -710,7 +728,16 @@ static cycle_t read_tsc(struct clocksource *cs)
#ifdef CONFIG_X86_64
static cycle_t __vsyscall_fn vread_tsc(void)
{
- cycle_t ret = (cycle_t)vget_cycles();
+ cycle_t ret;
+
+ /*
+ * Surround the RDTSC by barriers, to make sure it's not
+ * speculated to outside the seqlock critical section and
+ * does not cause time warps:
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+ rdtsc_barrier();
return ret >= __vsyscall_gtod_data.clock.cycle_last ?
ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef..027b5b49899 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
* of a critical section, to be able to prove TSC time-warps:
*/
static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
static __cpuinitdata cycles_t last_tsc;
static __cpuinitdata cycles_t max_warp;
static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
return;
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- printk(KERN_INFO
- "Skipping synchronization checks as TSC is reliable.\n");
+ pr_info("Skipping synchronization checks as TSC is reliable.\n");
return;
}
- printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
- smp_processor_id(), cpu);
+ pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
/*
* Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (nr_warps) {
printk("\n");
- printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
- " turning off TSC clock.\n", max_warp);
+ pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
mark_tsc_unstable("check_tsc_sync_source failed");
} else {
printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&stop_count) != cpus)
cpu_relax();
}
-#undef NR_LOOPS
-
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1..9c4e6253905 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs.pt.ds = 0;
info->regs.pt.es = 0;
info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+ info->regs.pt.gs = 0;
+#endif
/*
* The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
}
/*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
*/
- info->regs32->ax = 0;
+ info->regs32->ax = VM86_SIGNAL;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
"mov %2, %%gs\n\t"
+#endif
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211..95a7289e4b0 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
ap.ds = __USER_DS;
ap.es = __USER_DS;
ap.fs = __KERNEL_PERCPU;
- ap.gs = 0;
+ ap.gs = __KERNEL_STACK_CANARY;
ap.eflags = 0;
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
}
#endif
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
{
- paravirt_enter_lazy_cpu();
+ paravirt_start_context_switch(prev);
vmi_ops.set_lazy_mode(2);
}
+static void vmi_end_context_switch(struct task_struct *next)
+{
+ vmi_ops.set_lazy_mode(0);
+ paravirt_end_context_switch(next);
+}
+
static void vmi_enter_lazy_mmu(void)
{
paravirt_enter_lazy_mmu();
vmi_ops.set_lazy_mode(1);
}
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
vmi_ops.set_lazy_mode(0);
+ paravirt_leave_lazy_mmu();
}
static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
para_fill(pv_cpu_ops.io_delay, IODelay);
- para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+ para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
set_lazy_mode, SetLazyMode);
para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f01..9fc178255c0 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,394 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
+#define LOAD_OFFSET __PAGE_OFFSET
#else
-# include "vmlinux_64.lds.S"
+#define LOAD_OFFSET __START_KERNEL_map
#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386 /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_X86_64
+ user PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
+ init PT_LOAD FLAGS(7); /* RWE */
+#endif
+ note PT_NOTE FLAGS(0); /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+ . = __START_KERNEL;
+ phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
+ /* Text and read-only data */
+
+ /* bootstrapping code */
+ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+ _text = .;
+ *(.text.head)
+ } :text = 0x9090
+
+ /* The rest of the text */
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+ /* not really needed, already page aligned */
+ . = ALIGN(PAGE_SIZE);
+ *(.text.page_aligned)
+#endif
+ . = ALIGN(8);
+ _stext = .;
+ TEXT_TEXT
+ SCHED_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ IRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
+
+ NOTES :text :note
+
+ /* Exception table */
+ . = ALIGN(16);
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+ __start___ex_table = .;
+ *(__ex_table)
+ __stop___ex_table = .;
+ } :text = 0x9090
+
+ RO_DATA(PAGE_SIZE)
+
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
+ /* Start of data section */
+ _sdata = .;
+
+ /* init_task */
+ INIT_TASK_DATA(THREAD_SIZE)
+
+#ifdef CONFIG_X86_32
+ /* 32 bit has nosave before _edata */
+ NOSAVE_DATA
+#endif
+
+ PAGE_ALIGNED_DATA(PAGE_SIZE)
+ *(.data.idt)
+
+ CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+
+ DATA_DATA
+ CONSTRUCTORS
+
+ /* rarely changed data like cpu maps */
+ READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+
+ /* End of data section */
+ _edata = .;
+ } :data
+
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
+ PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
+ PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+ *(.vsyscall_0)
+ } :user
+
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+ *(.vsyscall_fn)
+ }
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+ *(.vsyscall_gtod_data)
+ }
+
+ vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+ .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+ *(.vsyscall_clock)
+ }
+ vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+ *(.vsyscall_1)
+ }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+ *(.vsyscall_2)
+ }
+
+ .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+ *(.vgetcpu_mode)
+ }
+ vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .jiffies : AT(VLOAD(.jiffies)) {
+ *(.jiffies)
+ }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+ *(.vsyscall_3)
+ }
+
+ . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
+
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
+ }
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - .init.text - should
+ * start another segment - init.
+ */
+ PERCPU_VADDR(0, :percpu)
+#endif
+
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+ _sinittext = .;
+ INIT_TEXT
+ _einittext = .;
+ }
+#ifdef CONFIG_X86_64
+ :init
+#endif
+
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+ INIT_DATA
+ }
+
+ . = ALIGN(16);
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
+ INITCALLS
+ __initcall_end = .;
+ }
+
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
+ *(.con_initcall.init)
+ __con_initcall_end = .;
+ }
+
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
+ }
+
+ SECURITY_INIT
+
+ . = ALIGN(8);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
+
+ . = ALIGN(8);
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ }
+
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
+
+ /*
+ * .exit.text is discard at runtime, not link time, to deal with
+ * references from .altinstructions and .eh_frame
+ */
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ EXIT_TEXT
+ }
+
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+ EXIT_DATA
+ }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ . = ALIGN(PAGE_SIZE);
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
+#endif
+
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
+ PERCPU(PAGE_SIZE)
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+
+ /* freed after init ends here */
+ .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+ __init_end = .;
+ }
+
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ }
+
+#ifdef CONFIG_X86_64
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ NOSAVE_DATA
+ }
+#endif
+
+ /* BSS */
+ . = ALIGN(PAGE_SIZE);
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __bss_start = .;
+ *(.bss.page_aligned)
+ *(.bss)
+ . = ALIGN(4);
+ __bss_stop = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ __brk_base = .;
+ . += 64 * 1024; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = .;
+ }
+
+ .end : AT(ADDR(.end) - LOAD_OFFSET) {
+ _end = .;
+ }
+
+ /* Sections to be discarded */
+ /DISCARD/ : {
+ *(.exitcall.exit)
+ *(.eh_frame)
+ *(.discard)
+ }
+
+ STABS_DEBUG
+ DWARF_DEBUG
+}
+
+
+#ifdef CONFIG_X86_32
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+
+#ifdef CONFIG_SMP
+. = ASSERT((per_cpu__irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big");
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f..00000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(0); /* ___ */
-}
-SECTIONS
-{
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
-
- .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
- *(.text.head)
- } :text = 0x9090
-
- /* read-only */
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
- *(.text.page_aligned)
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
- *(.gnu.warning)
- _etext = .; /* End of text section */
- } :text = 0x9090
-
- NOTES :text :note
-
- . = ALIGN(16); /* Exception table */
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
-
- RODATA
-
- /* writeable */
- . = ALIGN(PAGE_SIZE);
- .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
- DATA_DATA
- CONSTRUCTORS
- } :data
-
- . = ALIGN(PAGE_SIZE);
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- }
-
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- *(.data.page_aligned)
- *(.data.idt)
- }
-
- . = ALIGN(32);
- .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- *(.data.cacheline_aligned)
- }
-
- /* rarely changed data like cpu maps */
- . = ALIGN(32);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
- _edata = .; /* End of data section */
- }
-
- . = ALIGN(THREAD_SIZE); /* init_task */
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- *(.data.init_task)
- }
-
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- }
- /* will be freed after init
- * Following ALIGN() is required to make sure no other data falls on the
- * same page where __smp_alt_end is pointing as that page might be freed
- * after boot. Always make sure that ALIGN() directive is present after
- * the section which contains __smp_alt_end.
- */
- . = ALIGN(PAGE_SIZE);
-
- /* will be freed after init */
- . = ALIGN(PAGE_SIZE); /* Init code and data */
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- __init_begin = .;
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- INIT_DATA
- }
- . = ALIGN(16);
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
- }
- SECURITY_INIT
- . = ALIGN(4);
- .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
- __alt_instructions = .;
- *(.altinstructions)
- __alt_instructions_end = .;
- }
- .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
- *(.altinstr_replacement)
- }
- . = ALIGN(4);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
- /* .exit.text is discard at runtime, not link time, to deal with references
- from .altinstructions and .eh_frame */
- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
- EXIT_TEXT
- }
- .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
- EXIT_DATA
- }
-#if defined(CONFIG_BLK_DEV_INITRD)
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
- PERCPU(PAGE_SIZE)
- . = ALIGN(PAGE_SIZE);
- /* freed after init ends here */
-
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- __init_end = .;
- __bss_start = .; /* BSS */
- *(.bss.page_aligned)
- *(.bss)
- . = ALIGN(4);
- __bss_stop = .;
- }
-
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __brk_base = . ;
- . += 64 * 1024 ; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
- __brk_limit = . ;
- }
-
- .end : AT(ADDR(.end) - LOAD_OFFSET) {
- _end = . ;
- }
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.discard)
- }
-
- STABS_DEBUG
-
- DWARF_DEBUG
-}
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b03..00000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386 /* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
- user PT_LOAD FLAGS(7); /* RWE */
- data.init PT_LOAD FLAGS(7); /* RWE */
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(7); /* RWE */
-#endif
- data.init2 PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(0); /* ___ */
-}
-SECTIONS
-{
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
- /* First the code that has to be first for bootstrapping */
- *(.text.head)
- _stext = .;
- /* Then the rest */
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
- *(.gnu.warning)
- _etext = .; /* End of text section */
- } :text = 0x9090
-
- NOTES :text :note
-
- . = ALIGN(16); /* Exception table */
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
-
- RODATA
-
- . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
- /* Data */
- .data : AT(ADDR(.data) - LOAD_OFFSET) {
- DATA_DATA
- CONSTRUCTORS
- _edata = .; /* End of data section */
- } :data
-
-
- .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- *(.data.cacheline_aligned)
- }
- . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
- }
-
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
- . = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
- __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
- { *(.vsyscall_gtod_data) }
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
- { *(.vsyscall_clock) }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
- { *(.vsyscall_1) }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
- { *(.vsyscall_2) }
-
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
- jiffies = VVIRT(.jiffies);
-
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
- { *(.vsyscall_3) }
-
- . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- . = ALIGN(THREAD_SIZE); /* init_task */
- *(.data.init_task)
- }:data.init
-
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- *(.data.page_aligned)
- }
-
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- __smp_alt_begin = .;
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- __smp_alt_end = .;
- }
-
- . = ALIGN(PAGE_SIZE); /* Init code and data */
- __init_begin = .; /* paired with __init_end */
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- __initdata_begin = .;
- INIT_DATA
- __initdata_end = .;
- }
-
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- . = ALIGN(16);
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
- }
- SECURITY_INIT
-
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
- .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
- . = ALIGN(8);
- __alt_instructions = .;
- *(.altinstructions)
- __alt_instructions_end = .;
- }
- .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
- *(.altinstr_replacement)
- }
- /* .exit.text is discard at runtime, not link time, to deal with references
- from .altinstructions and .eh_frame */
- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
- EXIT_TEXT
- }
- .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
- EXIT_DATA
- }
-
-#ifdef CONFIG_BLK_DEV_INITRD
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
-
-#ifdef CONFIG_SMP
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - __data_nosave - should
- * start another section data.init2. Also, pda should be at the head of
- * percpu area. Preallocate it and define the percpu offset symbol
- * so that it can be accessed as a percpu variable.
- */
- . = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
-#else
- PERCPU(PAGE_SIZE)
-#endif
-
- . = ALIGN(PAGE_SIZE);
- __init_end = .;
-
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __bss_start = .; /* BSS */
- *(.bss.page_aligned)
- *(.bss)
- __bss_stop = .;
- }
-
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __brk_base = . ;
- . += 64 * 1024 ; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
- __brk_limit = . ;
- }
-
- _end = . ;
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
- STABS_DEBUG
-
- DWARF_DEBUG
-}
-
- /*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc906..25ee06a80aa 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
return;
}
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
now = vread();
- rdtsc_barrier();
-
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78c..8600a09e0c6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
Provides support for KVM on Intel processors equipped with the VT
extensions.
+ To compile this as a module, choose M here: the module
+ will be called kvm-intel.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
+ To compile this as a module, choose M here: the module
+ will be called kvm-amd.
+
config KVM_TRACE
bool "KVM trace support"
depends on KVM && SYSFS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f..b43c4efafe8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o
+ i8254.o timer.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d315..21f68e00524 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,40 @@ static int pit_get_gate(struct kvm *kvm, int channel)
return kvm->arch.vpit->pit_state.channels[channel].gate;
}
+static s64 __kpit_elapsed(struct kvm *kvm)
+{
+ s64 elapsed;
+ ktime_t remaining;
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ if (!ps->pit_timer.period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+ remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
+ elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->pit_timer.period);
+
+ return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(kvm);
+
+ return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
static int pit_get_count(struct kvm *kvm, int channel)
{
struct kvm_kpit_channel_state *c =
@@ -107,7 +141,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
- t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
@@ -137,7 +171,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
- t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
@@ -193,28 +227,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
-static int __pit_timer_fn(struct kvm_kpit_state *ps)
-{
- struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
- struct kvm_kpit_timer *pt = &ps->pit_timer;
-
- if (!atomic_inc_and_test(&pt->pending))
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-
- if (!pt->reinject)
- atomic_set(&pt->pending, 1);
-
- if (vcpu0 && waitqueue_active(&vcpu0->wq))
- wake_up_interruptible(&vcpu0->wq);
-
- hrtimer_add_expires_ns(&pt->timer, pt->period);
- pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
- if (pt->period)
- ps->channels[0].count_load_time = ktime_get();
-
- return (pt->period == 0 ? 0 : 1);
-}
-
int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -235,21 +247,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
spin_unlock(&ps->inject_lock);
}
-static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
-{
- struct kvm_kpit_state *ps;
- int restart_timer = 0;
-
- ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
-
- restart_timer = __pit_timer_fn(ps);
-
- if (restart_timer)
- return HRTIMER_RESTART;
- else
- return HRTIMER_NORESTART;
-}
-
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +260,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
-static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+static void destroy_pit_timer(struct kvm_timer *pt)
{
pr_debug("pit: execute del timer!\n");
hrtimer_cancel(&pt->timer);
}
+static bool kpit_is_periodic(struct kvm_timer *ktimer)
+{
+ struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
+ pit_timer);
+ return ps->is_periodic;
+}
+
+static struct kvm_timer_ops kpit_ops = {
+ .is_periodic = kpit_is_periodic,
+};
+
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
- struct kvm_kpit_timer *pt = &ps->pit_timer;
+ struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +288,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
- pt->period = (is_period == 0) ? 0 : interval;
- pt->timer.function = pit_timer_fn;
+ pt->period = interval;
+ ps->is_periodic = is_period;
+
+ pt->timer.function = kvm_timer_fn;
+ pt->t_ops = &kpit_ops;
+ pt->kvm = ps->pit->kvm;
+ pt->vcpu_id = 0;
+
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -298,23 +312,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
/*
- * Though spec said the state of 8254 is undefined after power-up,
- * seems some tricky OS like Windows XP depends on IRQ0 interrupt
- * when booting up.
- * So here setting initialize rate for it, and not a specific number
+ * The largest possible initial count is 0; this is equivalent
+ * to 216 for binary counting and 104 for BCD counting.
*/
if (val == 0)
val = 0x10000;
- ps->channels[channel].count_load_time = ktime_get();
ps->channels[channel].count = val;
- if (channel != 0)
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = ktime_get();
return;
+ }
/* Two types of timer
* mode 1 is one shot, mode 2 is period, otherwise del timer */
switch (ps->channels[0].mode) {
+ case 0:
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d..bbd863ff60b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,15 +3,6 @@
#include "iodev.h"
-struct kvm_kpit_timer {
- struct hrtimer timer;
- int irq;
- s64 period; /* unit: ns */
- s64 scheduled;
- atomic_t pending;
- bool reinject;
-};
-
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
- struct kvm_kpit_timer pit_timer;
+ struct kvm_timer pit_timer;
+ bool is_periodic;
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6f..96dfbb6ad2a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
#include "irq.h"
#include "i8254.h"
+#include "x86.h"
/*
* check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
struct kvm_pic *s;
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.pending;
+
if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm); /* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
struct kvm_pic *s;
int vector;
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.nr;
+
vector = kvm_get_apic_interrupt(v); /* APIC */
if (vector == -1) {
if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 00000000000..26bd6ba74e1
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm_timer_ops *t_ops;
+ struct kvm *kvm;
+ int vcpu_id;
+};
+
+struct kvm_timer_ops {
+ bool (*is_periodic)(struct kvm_timer *);
+};
+
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd6..ae99d83f81a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic_test_and_set_irr(vec, apic)) {
- /* a new pending irq is set in IRR */
- if (trig)
- apic_set_vector(vec, apic->regs + APIC_TMR);
- else
- apic_clear_vector(vec, apic->regs + APIC_TMR);
- kvm_vcpu_kick(apic->vcpu);
- return 1;
- }
- return 0;
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode);
}
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
{
- return kvm_apic_id(apic) == dest;
+ return dest == 0xff || kvm_apic_id(apic) == dest;
}
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
return result;
}
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, int dest, int dest_mode)
{
int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
- "dest_mode 0x%x, short_hand 0x%x",
+ "dest_mode 0x%x, short_hand 0x%x\n",
target, source, dest, dest_mode, short_hand);
ASSERT(!target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
- if (dest_mode == 0) {
+ if (dest_mode == 0)
/* Physical mode. */
- if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
- result = 1;
- } else
+ result = kvm_apic_match_physical_addr(target, dest);
+ else
/* Logical mode. */
result = kvm_apic_match_logical_addr(target, dest);
break;
case APIC_DEST_SELF:
- if (target == source)
- result = 1;
+ result = (target == source);
break;
case APIC_DEST_ALLINC:
result = 1;
break;
case APIC_DEST_ALLBUT:
- if (target != source)
- result = 1;
+ result = (target != source);
break;
default:
printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode)
{
- int orig_irr, result = 0;
+ int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
switch (delivery_mode) {
- case APIC_DM_FIXED:
case APIC_DM_LOWEST:
+ vcpu->arch.apic_arb_prio++;
+ case APIC_DM_FIXED:
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
- orig_irr = apic_test_and_set_irr(vector, apic);
- if (orig_irr && trig_mode) {
- apic_debug("level trig mode repeatedly for vector %d",
- vector);
+ result = !apic_test_and_set_irr(vector, apic);
+ if (!result) {
+ if (trig_mode)
+ apic_debug("level trig mode repeatedly for "
+ "vector %d", vector);
break;
}
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic_set_vector(vector, apic->regs + APIC_TMR);
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
-
kvm_vcpu_kick(vcpu);
-
- result = (orig_irr == 0);
break;
case APIC_DM_REMRD:
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_NMI:
+ result = 1;
kvm_inject_nmi(vcpu);
kvm_vcpu_kick(vcpu);
break;
case APIC_DM_INIT:
if (level) {
+ result = 1;
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
printk(KERN_DEBUG
"INIT on a runnable vcpu %d\n",
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic_debug("SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ result = 1;
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
kvm_vcpu_kick(vcpu);
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
-static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
- unsigned long bitmap)
-{
- int last;
- int next;
- struct kvm_lapic *apic = NULL;
-
- last = kvm->arch.round_robin_prev_vcpu;
- next = last;
-
- do {
- if (++next == KVM_MAX_VCPUS)
- next = 0;
- if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
- continue;
- apic = kvm->vcpus[next]->arch.apic;
- if (apic && apic_enabled(apic))
- break;
- apic = NULL;
- } while (next != last);
- kvm->arch.round_robin_prev_vcpu = next;
-
- if (!apic)
- printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-
- return apic;
-}
-
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
- unsigned long bitmap)
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
{
- struct kvm_lapic *apic;
-
- apic = kvm_apic_round_robin(kvm, vector, bitmap);
- if (apic)
- return apic->vcpu;
- return NULL;
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
static void apic_set_eoi(struct kvm_lapic *apic)
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
{
u32 icr_low = apic_get_reg(apic, APIC_ICR);
u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+ struct kvm_lapic_irq irq;
- unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
- unsigned int short_hand = icr_low & APIC_SHORT_MASK;
- unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
- unsigned int level = icr_low & APIC_INT_ASSERT;
- unsigned int dest_mode = icr_low & APIC_DEST_MASK;
- unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
- unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
- struct kvm_vcpu *target;
- struct kvm_vcpu *vcpu;
- unsigned long lpr_map = 0;
- int i;
+ irq.vector = icr_low & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr_low & APIC_MODE_MASK;
+ irq.dest_mode = icr_low & APIC_DEST_MASK;
+ irq.level = icr_low & APIC_INT_ASSERT;
+ irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
"dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
- icr_high, icr_low, short_hand, dest,
- trig_mode, level, dest_mode, delivery_mode, vector);
-
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
- vcpu = apic->vcpu->kvm->vcpus[i];
- if (!vcpu)
- continue;
-
- if (vcpu->arch.apic &&
- apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
- if (delivery_mode == APIC_DM_LOWEST)
- set_bit(vcpu->vcpu_id, &lpr_map);
- else
- __apic_accept_irq(vcpu->arch.apic, delivery_mode,
- vector, level, trig_mode);
- }
- }
+ icr_high, icr_low, irq.shorthand, irq.dest_id,
+ irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+ irq.vector);
- if (delivery_mode == APIC_DM_LOWEST) {
- target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
- if (target != NULL)
- __apic_accept_irq(target->arch.apic, delivery_mode,
- vector, level, trig_mode);
- }
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
}
static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
if (apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
- remaining = hrtimer_expires_remaining(&apic->timer.dev);
+ remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
if (ktime_to_ns(remaining) < 0)
remaining = ktime_set(0, 0);
- ns = mod_64(ktime_to_ns(remaining), apic->timer.period);
- tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+ ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+ tmcct = div64_u64(ns,
+ (APIC_BUS_CYCLE_NS * apic->divide_count));
return tmcct;
}
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic)
tdcr = apic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
- apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+ apic->divide_count = 0x1 << (tmp2 & 0x7);
apic_debug("timer divide count is 0x%x\n",
- apic->timer.divide_count);
+ apic->divide_count);
}
static void start_apic_timer(struct kvm_lapic *apic)
{
- ktime_t now = apic->timer.dev.base->get_time();
+ ktime_t now = apic->lapic_timer.timer.base->get_time();
- apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
- APIC_BUS_CYCLE_NS * apic->timer.divide_count;
- atomic_set(&apic->timer.pending, 0);
+ apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
+ APIC_BUS_CYCLE_NS * apic->divide_count;
+ atomic_set(&apic->lapic_timer.pending, 0);
- if (!apic->timer.period)
+ if (!apic->lapic_timer.period)
return;
- hrtimer_start(&apic->timer.dev,
- ktime_add_ns(now, apic->timer.period),
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, apic->lapic_timer.period),
HRTIMER_MODE_ABS);
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
"expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
apic_get_reg(apic, APIC_TMICT),
- apic->timer.period,
+ apic->lapic_timer.period,
ktime_to_ns(ktime_add_ns(now,
- apic->timer.period)));
+ apic->lapic_timer.period)));
}
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
}
- atomic_set(&apic->timer.pending, 0);
+ atomic_set(&apic->lapic_timer.pending, 0);
}
break;
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_TMICT:
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
return;
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&vcpu->arch.apic->timer.dev);
+ hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
if (vcpu->arch.apic->regs_page)
__free_page(vcpu->arch.apic->regs_page);
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
ASSERT(apic != NULL);
/* Stop the timer in case it's a reset to an active apic */
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
update_divide_count(apic);
- atomic_set(&apic->timer.pending, 0);
+ atomic_set(&apic->lapic_timer.pending, 0);
if (vcpu->vcpu_id == 0)
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
+ vcpu->arch.apic_arb_prio = 0;
+
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_reset);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int ret = 0;
-
- if (!apic)
- return 0;
- ret = apic_enabled(apic);
+ return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
+}
- return ret;
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
*----------------------------------------------------------------------
*/
-/* TODO: make sure __apic_timer_fn runs in current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
+static bool lapic_is_periodic(struct kvm_timer *ktimer)
{
- int result = 0;
- wait_queue_head_t *q = &apic->vcpu->wq;
-
- if(!atomic_inc_and_test(&apic->timer.pending))
- set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (apic_lvtt_period(apic)) {
- result = 1;
- hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
- }
- return result;
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
+ lapic_timer);
+ return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
struct kvm_lapic *lapic = vcpu->arch.apic;
if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->timer.pending);
+ return atomic_read(&lapic->lapic_timer.pending);
return 0;
}
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
- struct kvm_lapic *apic;
- int restart_timer = 0;
-
- apic = container_of(data, struct kvm_lapic, timer.dev);
-
- restart_timer = __apic_timer_fn(apic);
-
- if (restart_timer)
- return HRTIMER_RESTART;
- else
- return HRTIMER_NORESTART;
-}
+static struct kvm_timer_ops lapic_timer_ops = {
+ .is_periodic = lapic_is_periodic,
+};
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
- hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- apic->timer.dev.function = apic_timer_fn;
+ hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ apic->lapic_timer.timer.function = kvm_timer_fn;
+ apic->lapic_timer.t_ops = &lapic_timer_ops;
+ apic->lapic_timer.kvm = vcpu->kvm;
+ apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->timer.pending) > 0) {
+ if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->timer.pending);
+ atomic_dec(&apic->lapic_timer.pending);
}
}
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
MSR_IA32_APICBASE_BASE;
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->timer.dev);
+ hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
start_apic_timer(apic);
}
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
if (!apic)
return;
- timer = &apic->timer.dev;
+ timer = &apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee7120..a587f8349c4 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
#define __KVM_X86_LAPIC_H
#include "iodev.h"
+#include "kvm_timer.h"
#include <linux/kvm_host.h>
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
- struct {
- atomic_t pending;
- s64 period; /* unit: ns */
- u32 divide_count;
- struct hrtimer dev;
- } timer;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
struct kvm_vcpu *vcpu;
struct page *regs_page;
void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e5728..0ef5bb2b404 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
#define PT_DIRECTORY_LEVEL 2
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mt_mask;
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
+}
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
- shadow_mt_mask = mt_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
return vcpu->arch.shadow_efer & EFER_NX;
}
-static int is_present_pte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
static int is_shadow_present_pte(u64 pte)
{
return pte != shadow_trap_nonpresent_pte
@@ -490,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
*
* If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
* containing more mappings.
+ *
+ * Returns the number of rmap entries before the spte was added or zero if
+ * the spte was not added.
+ *
*/
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
unsigned long *rmapp;
- int i;
+ int i, count = 0;
if (!is_rmap_pte(*spte))
- return;
+ return count;
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
@@ -516,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+ while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
desc = desc->more;
+ count += RMAP_EXT;
+ }
if (desc->shadow_ptes[RMAP_EXT-1]) {
desc->more = mmu_alloc_rmap_desc(vcpu);
desc = desc->more;
@@ -526,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
;
desc->shadow_ptes[i] = spte;
}
+ return count;
}
static void rmap_desc_remove_entry(unsigned long *rmapp,
@@ -755,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
return young;
}
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
+{
+ unsigned long *rmapp;
+
+ gfn = unalias_gfn(vcpu->kvm, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+
+ kvm_unmap_rmapp(vcpu->kvm, rmapp);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
@@ -1074,18 +1093,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
-static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- list_del(&sp->oos_link);
- --kvm->stat.mmu_unsync_global;
-}
-
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
sp->unsync = 0;
- if (sp->global)
- kvm_unlink_unsync_global(kvm, sp);
--kvm->stat.mmu_unsync;
}
@@ -1248,7 +1259,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
- sp->global = 0;
hlist_add_head(&sp->hash_link, bucket);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1417,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
+ int used_pages;
+
+ used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
+ used_pages = max(0, used_pages);
+
/*
* If we set the number of mmu pages to be smaller be than the
* number of actived pages , we must to free some mmu pages before we
* change the value
*/
- if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
- kvm_nr_mmu_pages) {
- int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
- - kvm->arch.n_free_mmu_pages;
-
- while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+ if (used_pages > kvm_nr_mmu_pages) {
+ while (used_pages > kvm_nr_mmu_pages) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(kvm, page);
- n_used_mmu_pages--;
+ used_pages--;
}
kvm->arch.n_free_mmu_pages = 0;
}
@@ -1616,7 +1627,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
return mtrr_state->def_type;
}
-static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u8 mtrr;
@@ -1626,6 +1637,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
mtrr = MTRR_TYPE_WRBACK;
return mtrr;
}
+EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -1646,11 +1658,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
- if (sp->global) {
- list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
- ++vcpu->kvm->stat.mmu_unsync_global;
- } else
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
mmu_convert_notrap(sp);
return 0;
@@ -1677,21 +1685,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
- int global, gfn_t gfn, pfn_t pfn, bool speculative,
+ gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
u64 spte;
int ret = 0;
- u64 mt_mask = shadow_mt_mask;
- struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
-
- if (!global && sp->global) {
- sp->global = 0;
- if (sp->unsync) {
- kvm_unlink_unsync_global(vcpu->kvm, sp);
- kvm_mmu_mark_parents_unsync(vcpu, sp);
- }
- }
/*
* We don't set the accessed bit, since we sometimes want to see
@@ -1711,16 +1709,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= shadow_user_mask;
if (largepage)
spte |= PT_PAGE_SIZE_MASK;
- if (mt_mask) {
- if (!kvm_is_mmio_pfn(pfn)) {
- mt_mask = get_memory_type(vcpu, gfn) <<
- kvm_x86_ops->get_mt_mask_shift();
- mt_mask |= VMX_EPT_IGMT_BIT;
- } else
- mt_mask = MTRR_TYPE_UNCACHABLE <<
- kvm_x86_ops->get_mt_mask_shift();
- spte |= mt_mask;
- }
+ if (tdp_enabled)
+ spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
spte |= (u64)pfn << PAGE_SHIFT;
@@ -1765,11 +1756,12 @@ set_pte:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, int global,
- gfn_t gfn, pfn_t pfn, bool speculative)
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
+ int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
@@ -1795,7 +1787,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
was_rmapped = 1;
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, global, gfn, pfn, speculative, true)) {
+ dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1811,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
- rmap_add(vcpu, shadow_pte, gfn, largepage);
+ rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
if (!is_rmap_pte(*shadow_pte))
kvm_release_pfn_clean(pfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, gfn, largepage);
} else {
if (was_writeble)
kvm_release_pfn_dirty(pfn);
@@ -1843,7 +1837,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
|| (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- largepage, 0, gfn, pfn, false);
+ largepage, gfn, pfn, false);
++vcpu->stat.pf_fixed;
break;
}
@@ -1942,7 +1936,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+ int ret = 0;
+
+ if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
int i;
gfn_t root_gfn;
@@ -1957,13 +1963,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
direct = 1;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
- return;
+ return 0;
}
direct = !is_paging(vcpu);
if (tdp_enabled)
@@ -1980,6 +1988,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -1988,6 +1998,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ return 0;
}
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2006,7 +2017,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- if (root) {
+ if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -2014,15 +2025,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
-static void mmu_sync_global(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_page *sp, *n;
-
- list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
- kvm_sync_page(vcpu, sp);
-}
-
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->kvm->mmu_lock);
@@ -2030,13 +2032,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_global(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -2151,6 +2146,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
+static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+{
+ int bit7;
+
+ bit7 = (gpte >> 7) & 1;
+ return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -2159,6 +2162,59 @@ static void paging_free(struct kvm_vcpu *vcpu)
#include "paging_tmpl.h"
#undef PTTYPE
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ u64 exb_bit_rsvd = 0;
+
+ if (!is_nx(vcpu))
+ exb_bit_rsvd = rsvd_bits(63, 63);
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ context->rsvd_bits_mask[0][1] = 0;
+ context->rsvd_bits_mask[0][0] = 0;
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ break;
+ case PT32E_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 63) |
+ rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PDE */
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PTE */
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ break;
+ case PT64_ROOT_LEVEL:
+ context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+ context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+ context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 20); /* large page */
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ break;
+ }
+}
+
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2235,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}
@@ -2186,6 +2243,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2259,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
@@ -2221,12 +2280,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
+ reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT64_ROOT_LEVEL;
} else if (is_pae(vcpu)) {
+ reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT32E_ROOT_LEVEL;
} else {
+ reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->gva_to_gpa = paging32_gva_to_gpa;
context->root_level = PT32_ROOT_LEVEL;
}
@@ -2290,9 +2352,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
goto out;
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- mmu_alloc_roots(vcpu);
+ r = mmu_alloc_roots(vcpu);
mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
+ if (r)
+ goto out;
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
@@ -2638,14 +2702,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_page *sp;
-
- while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
- cond_resched();
- }
free_page((unsigned long)vcpu->arch.mmu.pae_root);
}
@@ -2710,7 +2766,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
- spin_lock(&kvm->mmu_lock);
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
u64 *pt;
@@ -2725,7 +2780,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
- spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_zap_all(struct kvm *kvm)
@@ -3007,11 +3061,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
" in nonleaf level: levels %d gva %lx"
" level %d pte %llx\n", audit_msg,
vcpu->arch.mmu.root_level, va, level, ent);
-
- audit_mappings_page(vcpu, ent, va, level - 1);
+ else
+ audit_mappings_page(vcpu, ent, va, level - 1);
} else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
- hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62..3494a2fb136 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu->arch.cr0 & X86_CR0_PG;
}
+static inline int is_present_pte(unsigned long pte)
+{
+ return pte & PT_PRESENT_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c56..67785f63539 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
gfn_t table_gfn;
unsigned index, pt_access, pte_access;
gpa_t pte_gpa;
+ int rsvd_fault = 0;
pgprintk("%s: addr %lx\n", __func__, addr);
walk:
@@ -157,6 +158,10 @@ walk:
if (!is_present_pte(pte))
goto not_present;
+ rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
+ if (rsvd_fault)
+ goto access_error;
+
if (write_fault && !is_writeble_pte(pte))
if (user_fault || is_write_protection(vcpu))
goto access_error;
@@ -209,7 +214,6 @@ walk:
if (ret)
goto walk;
pte |= PT_DIRTY_MASK;
- kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
walker->ptes[walker->level - 1] = pte;
}
@@ -233,6 +237,8 @@ err:
walker->error_code |= PFERR_USER_MASK;
if (fetch_fault)
walker->error_code |= PFERR_FETCH_MASK;
+ if (rsvd_fault)
+ walker->error_code |= PFERR_RSVD_MASK;
return 0;
}
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage,
- gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
- pfn, true);
+ gpte_to_gfn(gpte), pfn, true);
}
/*
@@ -276,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep;
+ u64 spte, *sptep = NULL;
int direct;
gfn_t table_gfn;
int r;
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
user_fault, write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
ptwrite, largepage,
- gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
gw->gfn, pfn, false);
break;
}
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return r;
/*
- * Look up the shadow pte for the faulting address.
+ * Look up the guest pte for the faulting address.
*/
r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
+ is_dirty_pte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6..b1f658ad2f0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/module.h>
#include <linux/kernel.h>
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO);
static int nested = 0;
module_param(nested, int, S_IRUGO);
-static void kvm_reput_irq(struct vcpu_svm *svm);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat)
return svm_features & feat;
}
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
- int word_index = __ffs(vcpu->arch.irq_summary);
- int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
- if (!vcpu->arch.irq_pending[word_index])
- clear_bit(word_index, &vcpu->arch.irq_summary);
- return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
- set_bit(irq, vcpu->arch.irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
svm->vmcb->control.event_inj_err = error_code;
}
-static bool svm_exception_injected(struct kvm_vcpu *vcpu)
+static int is_external_interrupt(u32 info)
+{
+ info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+ return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
- return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+ return ret & mask;
}
-static int is_external_interrupt(u32 info)
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
}
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!svm->next_rip) {
- printk(KERN_DEBUG "%s: NOP\n", __func__);
+ if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+ EMULATE_DONE)
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
__func__, kvm_rip_read(vcpu), svm->next_rip);
kvm_rip_write(vcpu, svm->next_rip);
- svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
- vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ svm_set_interrupt_shadow(vcpu, 0);
}
static int has_svm(void)
@@ -715,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
svm->vmcb->control.tsc_offset += delta;
vcpu->cpu = cpu;
kvm_migrate_timers(vcpu);
+ svm->asid_generation = 0;
}
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
@@ -830,6 +827,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
if (!var->unusable)
var->type |= 0x1;
break;
+ case VCPU_SREG_SS:
+ /* On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ break;
}
}
@@ -960,15 +966,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
}
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void update_db_intercept(struct kvm_vcpu *vcpu)
{
- int old_debug = vcpu->guest_debug;
struct vcpu_svm *svm = to_svm(vcpu);
- vcpu->guest_debug = dbg->control;
-
svm->vmcb->control.intercept_exceptions &=
~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+
+ if (vcpu->arch.singlestep)
+ svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -979,6 +986,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1 << BP_VECTOR;
} else
vcpu->guest_debug = 0;
+}
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+ int old_debug = vcpu->guest_debug;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+
+ update_db_intercept(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -993,16 +1010,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
return 0;
}
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
- if (is_external_interrupt(exit_int_info))
- return exit_int_info & SVM_EVTINJ_VEC_MASK;
- return -1;
-}
-
static void load_host_msrs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
@@ -1025,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
}
- svm->vcpu.cpu = svm_data->cpu;
svm->asid_generation = svm_data->asid_generation;
svm->vmcb->control.asid = svm_data->next_asid++;
}
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- u32 exit_int_info = svm->vmcb->control.exit_int_info;
- struct kvm *kvm = svm->vcpu.kvm;
u64 fault_address;
u32 error_code;
- bool event_injection = false;
-
- if (!irqchip_in_kernel(kvm) &&
- is_external_interrupt(exit_int_info)) {
- event_injection = true;
- push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
- }
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
*/
if (npt_enabled)
svm_flush_tlb(&svm->vcpu);
-
- if (!npt_enabled && event_injection)
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ else {
+ if (kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ }
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
if (!(svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->vcpu.arch.singlestep) {
kvm_queue_exception(&svm->vcpu, DB_VECTOR);
return 1;
}
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- return 0;
+
+ if (svm->vcpu.arch.singlestep) {
+ svm->vcpu.arch.singlestep = false;
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
+ svm->vmcb->save.rflags &=
+ ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(&svm->vcpu);
+ }
+
+ if (svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
}
static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
if (svm->vmcb->control.exit_info_2 &
(1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
- return kvm_task_switch(&svm->vcpu, tss_selector,
- TASK_SWITCH_IRET);
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
- return kvm_task_switch(&svm->vcpu, tss_selector,
- TASK_SWITCH_JMP);
- return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ kvm_clear_exception_queue(&svm->vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_clear_interrupt_queue(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+ skip_emulated_instruction(&svm->vcpu);
+
+ return kvm_task_switch(&svm->vcpu, tss_selector, reason);
}
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ ++svm->vcpu.stat.nmi_window_exits;
+ svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+ svm->vcpu.arch.hflags |= HF_IRET_MASK;
+ return 1;
+}
+
static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm,
static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
+ u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
- if (irqchip_in_kernel(svm->vcpu.kvm))
+ if (irqchip_in_kernel(svm->vcpu.kvm)) {
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ return 1;
+ }
+ if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return 1;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
* If the user space waits to inject interrupts, exit as soon as
* possible
*/
- if (kvm_run->request_interrupt_window &&
- !svm->vcpu.arch.irq_summary) {
+ if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+ kvm_run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(&svm->vcpu)) {
++svm->vcpu.stat.irq_window_exits;
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_VINTR] = interrupt_window_interception,
/* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
[SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
}
- kvm_reput_irq(svm);
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF)
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
"exit_code 0x%x\n",
__func__, svm->vmcb->control.exit_int_info,
@@ -2237,11 +2300,20 @@ static void pre_svm_run(struct vcpu_svm *svm)
struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
- if (svm->vcpu.cpu != cpu ||
- svm->asid_generation != svm_data->asid_generation)
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->asid_generation != svm_data->asid_generation)
new_asid(svm, svm_data);
}
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+ ++vcpu->stat.nmi_injections;
+}
static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
}
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
-
- svm_inject_irq(svm, irq);
+ svm->vmcb->control.event_inj = nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+static void svm_set_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int max_irr, tpr;
- if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
- return;
+ nested_svm_intr(svm);
- vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
+}
- max_irr = kvm_lapic_find_highest_irr(vcpu);
- if (max_irr == -1)
- return;
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
- tpr = kvm_lapic_get_cr8(vcpu) << 4;
+ if (irr == -1)
+ return;
- if (tpr >= (max_irr & 0xf0))
- vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+ if (tpr >= irr)
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
}
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- int intr_vector = -1;
-
- if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
- ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
- intr_vector = vmcb->control.exit_int_info &
- SVM_EVTINJ_VEC_MASK;
- vmcb->control.exit_int_info = 0;
- svm_inject_irq(svm, intr_vector);
- goto out;
- }
-
- if (vmcb->control.int_ctl & V_IRQ_MASK)
- goto out;
-
- if (!kvm_cpu_has_interrupt(vcpu))
- goto out;
-
- if (nested_svm_intr(svm))
- goto out;
-
- if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
- goto out;
-
- if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
- (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
- /* unable to deliver irq, set pending irq */
- svm_set_vintr(svm);
- svm_inject_irq(svm, 0x0);
- goto out;
- }
- /* Okay, we can deliver the interrupt: grab it and update PIC state. */
- intr_vector = kvm_cpu_get_interrupt(vcpu);
- svm_inject_irq(svm, intr_vector);
-out:
- update_cr8_intercept(vcpu);
+ return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
}
-static void kvm_reput_irq(struct vcpu_svm *svm)
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- if ((control->int_ctl & V_IRQ_MASK)
- && !irqchip_in_kernel(svm->vcpu.kvm)) {
- control->int_ctl &= ~V_IRQ_MASK;
- push_irq(&svm->vcpu, control->int_vector);
- }
-
- svm->vcpu.arch.interrupt_window_open =
- !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ return (vmcb->save.rflags & X86_EFLAGS_IF) &&
+ !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ (svm->vcpu.arch.hflags & HF_GIF_MASK);
}
-static void svm_do_inject_vector(struct vcpu_svm *svm)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int word_index = __ffs(vcpu->arch.irq_summary);
- int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
- if (!vcpu->arch.irq_pending[word_index])
- clear_bit(word_index, &vcpu->arch.irq_summary);
- svm_inject_irq(svm, irq);
+ svm_set_vintr(to_svm(vcpu));
+ svm_inject_irq(to_svm(vcpu), 0x0);
}
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- if (nested_svm_intr(svm))
- return;
- svm->vcpu.arch.interrupt_window_open =
- (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK));
+ if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+ == HF_NMI_MASK)
+ return; /* IRET will cause a vm exit */
- if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
- /*
- * If interrupts enabled, and not blocked by sti or mov ss. Good.
- */
- svm_do_inject_vector(svm);
-
- /*
- * Interrupts blocked. Wait for unblock.
- */
- if (!svm->vcpu.arch.interrupt_window_open &&
- (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
- svm_set_vintr(svm);
- else
- svm_clear_vintr(svm);
+ /* Something prevents NMI from been injected. Single step over
+ possible problem (IRET or exception injection or interrupt
+ shadow) */
+ vcpu->arch.singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(vcpu);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
- kvm_lapic_set_tpr(vcpu, cr8);
+ kvm_set_cr8(vcpu, cr8);
}
}
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
cr8 = kvm_get_cr8(vcpu);
svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+
+ if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+ svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = true;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /* In case of software exception do not reinject an exception
+ vector, but re-execute and instruction instead */
+ if (kvm_exception_is_soft(vector))
+ break;
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_queue_exception_e(&svm->vcpu, vector, err);
+
+ } else
+ kvm_queue_exception(&svm->vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(&svm->vcpu, vector, false);
+ break;
+ default:
+ break;
+ }
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#else
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
+
+ svm_complete_interrupts(svm);
}
#undef R
@@ -2617,7 +2668,7 @@ static int get_npt_level(void)
#endif
}
-static int svm_get_mt_mask_shift(void)
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
return 0;
}
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = {
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
- .get_irq = svm_get_irq,
.set_irq = svm_set_irq,
+ .set_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
- .exception_injected = svm_exception_injected,
- .inject_pending_irq = svm_intr_assist,
- .inject_pending_vectors = do_interrupt_requests,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
- .get_mt_mask_shift = svm_get_mt_mask_shift,
+ .get_mt_mask = svm_get_mt_mask,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 00000000000..86dbac072d0
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/hrtimer.h>
+#include <asm/atomic.h>
+#include "kvm_timer.h"
+
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+ int restart_timer = 0;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /* FIXME: this code should not know anything about vcpus */
+ if (!atomic_inc_and_test(&ktimer->pending))
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+ if (!ktimer->reinject)
+ atomic_set(&ktimer->pending, 1);
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ restart_timer = 1;
+ }
+
+ return restart_timer;
+}
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+ int restart_timer;
+ struct kvm_vcpu *vcpu;
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+
+ vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+ if (!vcpu)
+ return HRTIMER_NORESTART;
+
+ restart_timer = __kvm_timer_fn(vcpu, ktimer);
+ if (restart_timer)
+ return HRTIMER_RESTART;
+ else
+ return HRTIMER_NORESTART;
+}
+
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716..29f912927a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,26 +32,27 @@
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
+#include <asm/mce.h>
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
-static int bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, 0);
+static int __read_mostly bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, S_IRUGO);
-static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0);
+static int __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
-static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, 0);
+static int __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-static int enable_ept = 1;
-module_param(enable_ept, bool, 0);
+static int __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, S_IRUGO);
-static int emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, 0);
+static int __read_mostly emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
struct vmcs {
u32 revision_id;
@@ -97,6 +98,7 @@ struct vcpu_vmx {
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
+ u32 exit_reason;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-static struct page *vmx_msr_bitmap;
+static unsigned long *vmx_io_bitmap_a;
+static unsigned long *vmx_io_bitmap_b;
+static unsigned long *vmx_msr_bitmap_legacy;
+static unsigned long *vmx_msr_bitmap_longmode;
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info)
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
+static inline int is_machine_check(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
static inline int cpu_has_vmx_msr_bitmap(void)
{
- return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
static inline int cpu_has_vmx_tpr_shadow(void)
{
- return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
static inline int vm_need_tpr_shadow(struct kvm *kvm)
{
- return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
+ return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}
static inline int cpu_has_secondary_exec_ctrls(void)
{
- return (vmcs_config.cpu_based_exec_ctrl &
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
- return flexpriority_enabled
- && (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
}
static inline int cpu_has_vmx_invept_individual_addr(void)
{
- return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
}
static inline int cpu_has_vmx_invept_context(void)
{
- return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
}
static inline int cpu_has_vmx_invept_global(void)
{
- return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
+ return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
}
static inline int cpu_has_vmx_ept(void)
{
- return (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_EPT);
-}
-
-static inline int vm_need_ept(void)
-{
- return (cpu_has_vmx_ept() && enable_ept);
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
}
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
- return ((cpu_has_vmx_virtualize_apic_accesses()) &&
- (irqchip_in_kernel(kvm)));
+ return flexpriority_enabled &&
+ (cpu_has_vmx_virtualize_apic_accesses()) &&
+ (irqchip_in_kernel(kvm));
}
static inline int cpu_has_vmx_vpid(void)
{
- return (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_VPID);
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
}
static inline int cpu_has_virtual_nmis(void)
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool report_flexpriority(void)
+{
+ return flexpriority_enabled;
+}
+
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void)
static inline void ept_sync_context(u64 eptp)
{
- if (vm_need_ept()) {
+ if (enable_ept) {
if (cpu_has_vmx_invept_context())
__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
else
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp)
static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
{
- if (vm_need_ept()) {
+ if (enable_ept) {
if (cpu_has_vmx_invept_individual_addr())
__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
eptp, gpa);
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
eb |= 1u << BP_VECTOR;
}
- if (vcpu->arch.rmode.active)
+ if (vcpu->arch.rmode.vm86_active)
eb = ~0;
- if (vm_need_ept())
+ if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->arch.rmode.active)
+ if (vcpu->arch.rmode.vm86_active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= X86_SHADOW_INT_MOV_SS;
+
+ return ret & mask;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ if (mask & X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+ if ((interruptibility != interruptibility_old))
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip;
- u32 interruptibility;
rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_rip_write(vcpu, rip);
- /*
- * We emulated an instruction, so temporary interrupt blocking
- * should be removed, if set.
- */
- interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- if (interruptibility & 3)
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- interruptibility & ~3);
- vcpu->arch.interrupt_window_open = 1;
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (vcpu->arch.rmode.active) {
+ if (vcpu->arch.rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
return;
}
- if (nr == BP_VECTOR || nr == OF_VECTOR) {
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ if (kvm_exception_is_soft(nr)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}
-static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs;
+ unsigned long *msr_bitmap;
vmx_load_host_state(vmx);
save_nmsrs = 0;
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
#endif
vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ if (is_long_mode(&vmx->vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy;
+
+ vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+ }
}
/*
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
return 0;
}
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->arch.interrupt.pending)
- return -1;
- return vcpu->arch.interrupt.nr;
-}
-
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
@@ -1241,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
struct page *pages;
struct vmcs *vmcs;
- pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
+ if (!cpu_has_vmx_vpid())
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept())
+ enable_ept = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
return alloc_kvm_area();
}
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.active = 0;
+ vcpu->arch.rmode.vm86_active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
- vcpu->arch.rmode.active = 1;
+ vcpu->arch.rmode.vm86_active = 1;
vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
vpid_sync_vcpu_all(to_vmx(vcpu));
- if (vm_need_ept())
+ if (enable_ept)
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx_fpu_deactivate(vcpu);
- if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+ if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
- if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
+ if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
- if (vm_need_ept())
+ if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
u64 eptp;
guest_cr3 = cr3;
- if (vm_need_ept()) {
+ if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
ept_sync_context(eptp);
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+ unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
- if (vm_need_ept())
+ if (enable_ept)
ept_update_paging_mode_cr4(&hw_cr4, vcpu);
vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
- if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+ if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
vcpu->arch.rmode.tr.selector = var->selector;
vcpu->arch.rmode.tr.base = var->base;
vcpu->arch.rmode.tr.limit = var->limit;
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vcpu->arch.rmode.active && var->s) {
+ if (vcpu->arch.rmode.vm86_active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
pfn_t identity_map_pfn;
u32 tmp;
- if (!vm_need_ept())
+ if (!enable_ept)
return 1;
if (unlikely(!kvm->arch.ept_identity_pagetable)) {
printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
int vpid;
vmx->vpid = 0;
- if (!enable_vpid || !cpu_has_vmx_vpid())
+ if (!enable_vpid)
return;
spin_lock(&vmx_vpid_lock);
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
spin_unlock(&vmx_vpid_lock);
}
-static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
{
- void *va;
+ int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
* have the write-low and read-high bitmap offsets the wrong way round.
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
- va = kmap(msr_bitmap);
if (msr <= 0x1fff) {
- __clear_bit(msr, va + 0x000); /* read-low */
- __clear_bit(msr, va + 0x800); /* write-low */
+ __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
- __clear_bit(msr, va + 0x400); /* read-high */
- __clear_bit(msr, va + 0xc00); /* write-high */
+ __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
}
- kunmap(msr_bitmap);
+}
+
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+{
+ if (!longmode_only)
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
}
/*
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
u32 exec_control;
/* I/O */
- vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+ vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
+ vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
+ vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
CPU_BASED_CR8_LOAD_EXITING;
#endif
}
- if (!vm_need_ept())
+ if (!enable_ept)
exec_control |= CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_INVLPG_EXITING;
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!vm_need_ept())
+ if (!enable_ept)
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
goto out;
}
- vmx->vcpu.arch.rmode.active = 0;
+ vmx->vcpu.arch.rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
++vcpu->stat.irq_injections;
- if (vcpu->arch.rmode.active) {
+ if (vcpu->arch.rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
- if (vcpu->arch.rmode.active) {
+ if (vcpu->arch.rmode.vm86_active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = NMI_VECTOR;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
-static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
{
- u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-
- vcpu->arch.nmi_window_open =
- !(guest_intr & (GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_NMI));
if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
- vcpu->arch.nmi_window_open = 0;
-
- vcpu->arch.interrupt_window_open =
- ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- !(guest_intr & (GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS)));
-}
-
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
- int word_index = __ffs(vcpu->arch.irq_summary);
- int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
+ return 0;
- clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
- if (!vcpu->arch.irq_pending[word_index])
- clear_bit(word_index, &vcpu->arch.irq_summary);
- kvm_queue_interrupt(vcpu, irq);
+ return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
+ GUEST_INTR_STATE_NMI));
}
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- vmx_update_window_states(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS);
-
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.interrupt.pending) {
- enable_nmi_window(vcpu);
- } else if (vcpu->arch.nmi_window_open) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_nmi_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (vcpu->arch.irq_summary
- || kvm_run->request_interrupt_window)
- enable_irq_window(vcpu);
- return;
- }
-
- if (vcpu->arch.interrupt_window_open) {
- if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
- kvm_do_inject_irq(vcpu);
-
- if (vcpu->arch.interrupt.pending)
- vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
- }
- if (!vcpu->arch.interrupt_window_open &&
- (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
- enable_irq_window(vcpu);
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
return 0;
}
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ /* already handled by vcpu_run */
+ return 1;
+}
+
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vect_info = vmx->idt_vectoring_info;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ if (is_machine_check(intr_info))
+ return handle_machine_check(vcpu, kvm_run);
+
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
!is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
"intr info 0x%x\n", __func__, vect_info, intr_info);
- if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
- int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
- set_bit(irq, vcpu->arch.irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
- }
-
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
- if (vm_need_ept())
+ if (enable_ept)
BUG();
cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2 >> 32), handler);
- if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
- if (vcpu->arch.rmode.active &&
+ if (vcpu->arch.rmode.vm86_active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
if (vcpu->arch.halt_request) {
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
- case 8:
- kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
- skip_emulated_instruction(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- return 1;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = kvm_register_read(vcpu, reg);
+ kvm_set_cr8(vcpu, cr8);
+ skip_emulated_instruction(vcpu);
+ if (irqchip_in_kernel(vcpu->kvm))
+ return 1;
+ if (cr8_prev <= cr8)
+ return 1;
+ kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
};
break;
case 2: /* clts */
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
* If the user space waits to inject interrupts, exit as soon as
* possible
*/
- if (kvm_run->request_interrupt_window &&
- !vcpu->arch.irq_summary) {
+ if (!irqchip_in_kernel(vcpu->kvm) &&
+ kvm_run->request_interrupt_window &&
+ !kvm_cpu_has_interrupt(vcpu)) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
}
@@ -2978,9 +3012,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
kvm_mmu_invlpg(vcpu, exit_qualification);
skip_emulated_instruction(vcpu);
@@ -2996,11 +3036,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u64 exit_qualification;
+ unsigned long exit_qualification;
enum emulation_result er;
unsigned long offset;
- exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
offset = exit_qualification & 0xffful;
er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3019,22 +3059,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
u16 tss_selector;
- int reason;
+ int reason, type, idt_v;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
reason = (u32)exit_qualification >> 30;
- if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
- (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
- == INTR_TYPE_NMI_INTR) {
- vcpu->arch.nmi_injected = false;
- if (cpu_has_virtual_nmis())
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ if (cpu_has_virtual_nmis())
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
}
tss_selector = exit_qualification;
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ skip_emulated_instruction(vcpu);
+
if (!kvm_task_switch(vcpu, tss_selector, reason))
return 0;
@@ -3051,11 +3110,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- u64 exit_qualification;
+ unsigned long exit_qualification;
gpa_t gpa;
int gla_validity;
- exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
if (exit_qualification & (1 << 6)) {
printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3067,7 +3126,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
- (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+ vmcs_readl(GUEST_LINEAR_ADDRESS));
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -3098,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
enum emulation_result err = EMULATE_DONE;
- preempt_enable();
local_irq_enable();
+ preempt_enable();
while (!guest_state_valid(vcpu)) {
err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3109,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
if (err != EMULATE_DONE) {
kvm_report_emulation_failure(vcpu, "emulation failure");
- return;
+ break;
}
if (signal_pending(current))
@@ -3118,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
schedule();
}
- local_irq_disable();
preempt_disable();
+ local_irq_disable();
vmx->invalid_state_emulation_result = err;
}
@@ -3145,11 +3204,21 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
+ [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
+ [EXIT_REASON_VMPTRST] = handle_vmx_insn,
+ [EXIT_REASON_VMREAD] = handle_vmx_insn,
+ [EXIT_REASON_VMRESUME] = handle_vmx_insn,
+ [EXIT_REASON_VMWRITE] = handle_vmx_insn,
+ [EXIT_REASON_VMOFF] = handle_vmx_insn,
+ [EXIT_REASON_VMON] = handle_vmx_insn,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3159,10 +3228,10 @@ static const int kvm_vmx_max_exit_handlers =
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
- u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3178,7 +3247,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
- if (vm_need_ept() && is_paging(vcpu)) {
+ if (enable_ept && is_paging(vcpu)) {
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
ept_load_pdptrs(vcpu);
}
@@ -3199,9 +3268,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
__func__, vectoring_info, exit_reason);
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
- if (vcpu->arch.interrupt_window_open) {
+ if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
- vcpu->arch.nmi_window_open = 1;
} else if (vmx->vnmi_blocked_time > 1000000000LL &&
vcpu->arch.nmi_pending) {
/*
@@ -3214,7 +3282,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
"state on VCPU %d after 1 s timeout\n",
__func__, vcpu->vcpu_id);
vmx->soft_vnmi_blocked = 0;
- vmx->vcpu.arch.nmi_window_open = 1;
}
}
@@ -3228,122 +3295,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return 0;
}
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
- int max_irr, tpr;
-
- if (!vm_need_tpr_shadow(vcpu->kvm))
- return;
-
- if (!kvm_lapic_enabled(vcpu) ||
- ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
+ if (irr == -1 || tpr < irr) {
vmcs_write32(TPR_THRESHOLD, 0);
return;
}
- tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
- vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
+ vmcs_write32(TPR_THRESHOLD, irr);
}
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
- u32 idt_vectoring_info;
+ u32 idt_vectoring_info = vmx->idt_vectoring_info;
bool unblock_nmi;
u8 vector;
int type;
bool idtv_info_valid;
- u32 error;
exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+ /* Handle machine checks before interrupts are enabled */
+ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
+ && is_machine_check(exit_intr_info)))
+ kvm_machine_check();
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+ (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+ asm("int $2");
+ }
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
if (cpu_has_virtual_nmis()) {
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
- * SDM 3: 25.7.1.2
+ * SDM 3: 27.7.1.2 (September 2008)
* Re-set bit "block by NMI" before VM entry if vmexit caused by
* a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
*/
- if (unblock_nmi && vector != DF_VECTOR)
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
- idt_vectoring_info = vmx->idt_vectoring_info;
- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+ vmx->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&vmx->vcpu);
+ kvm_clear_interrupt_queue(&vmx->vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
- if (vmx->vcpu.arch.nmi_injected) {
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vmx->vcpu.arch.nmi_injected = true;
/*
- * SDM 3: 25.7.1.2
- * Clear bit "block by NMI" before VM entry if a NMI delivery
- * faulted.
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Clear bit "block by NMI" before VM entry if a NMI
+ * delivery faulted.
*/
- if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->vcpu.arch.nmi_injected = false;
- }
- kvm_clear_exception_queue(&vmx->vcpu);
- if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
- type == INTR_TYPE_SOFT_EXCEPTION)) {
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ /* fall through */
+ case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
- kvm_queue_exception_e(&vmx->vcpu, vector, error);
+ u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ kvm_queue_exception_e(&vmx->vcpu, vector, err);
} else
kvm_queue_exception(&vmx->vcpu, vector);
- vmx->idt_vectoring_info = 0;
- }
- kvm_clear_interrupt_queue(&vmx->vcpu);
- if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
- kvm_queue_interrupt(&vmx->vcpu, vector);
- vmx->idt_vectoring_info = 0;
- }
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-{
- update_tpr_threshold(vcpu);
-
- vmx_update_window_states(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_STI |
- GUEST_INTR_STATE_MOV_SS);
-
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.interrupt.pending) {
- enable_nmi_window(vcpu);
- } else if (vcpu->arch.nmi_window_open) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_nmi_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
- return;
- }
- if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
- if (vcpu->arch.interrupt_window_open)
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
- else
- enable_irq_window(vcpu);
- }
- if (vcpu->arch.interrupt.pending) {
- vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
- if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ /* fall through */
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(&vmx->vcpu, vector,
+ type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
}
}
@@ -3381,7 +3433,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3505,20 +3556,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
- vmx_update_window_states(vcpu);
-
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- /* We need to handle NMIs before interrupts are enabled */
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (intr_info & INTR_INFO_VALID_MASK)) {
- KVMTRACE_0D(NMI, vcpu, handler);
- asm("int $2");
- }
-
vmx_complete_interrupts(vmx);
}
@@ -3593,7 +3633,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (alloc_apic_access_page(kvm) != 0)
goto free_vmcs;
- if (vm_need_ept())
+ if (enable_ept)
if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs;
@@ -3631,9 +3671,32 @@ static int get_ept_level(void)
return VMX_EPT_DEFAULT_GAW + 1;
}
-static int vmx_get_mt_mask_shift(void)
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- return VMX_EPT_MT_EPTE_SHIFT;
+ u64 ret;
+
+ /* For VT-d and EPT combination
+ * 1. MMIO: always map as UC
+ * 2. EPT with VT-d:
+ * a. VT-d without snooping control feature: can't guarantee the
+ * result, try to trust guest.
+ * b. VT-d with snooping control feature: snooping control feature of
+ * VT-d engine can guarantee the cache correctness. Just set it
+ * to WB to keep consistent with host. So the same as item 3.
+ * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+ * consistent with host MTRR
+ */
+ if (is_mmio)
+ ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+ else if (vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+ ret = kvm_get_guest_memory_type(vcpu, gfn) <<
+ VMX_EPT_MT_EPTE_SHIFT;
+ else
+ ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
+ | VMX_EPT_IGMT_BIT;
+
+ return ret;
}
static struct kvm_x86_ops vmx_x86_ops = {
@@ -3644,7 +3707,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+ .cpu_has_accelerated_tpr = report_flexpriority,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
@@ -3678,78 +3741,82 @@ static struct kvm_x86_ops vmx_x86_ops = {
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
- .handle_exit = kvm_handle_exit,
+ .handle_exit = vmx_handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
- .get_irq = vmx_get_irq,
.set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
- .exception_injected = vmx_exception_injected,
- .inject_pending_irq = vmx_intr_assist,
- .inject_pending_vectors = do_interrupt_requests,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
- .get_mt_mask_shift = vmx_get_mt_mask_shift,
+ .get_mt_mask = vmx_get_mt_mask,
};
static int __init vmx_init(void)
{
- void *va;
int r;
- vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_io_bitmap_a)
return -ENOMEM;
- vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_io_bitmap_b) {
r = -ENOMEM;
goto out;
}
- vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!vmx_msr_bitmap) {
+ vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy) {
r = -ENOMEM;
goto out1;
}
+ vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode) {
+ r = -ENOMEM;
+ goto out2;
+ }
+
/*
* Allow direct access to the PC debug port (it is often used for I/O
* delays, but the vmexits simply slow things down).
*/
- va = kmap(vmx_io_bitmap_a);
- memset(va, 0xff, PAGE_SIZE);
- clear_bit(0x80, va);
- kunmap(vmx_io_bitmap_a);
+ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+ clear_bit(0x80, vmx_io_bitmap_a);
- va = kmap(vmx_io_bitmap_b);
- memset(va, 0xff, PAGE_SIZE);
- kunmap(vmx_io_bitmap_b);
+ memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
- va = kmap(vmx_msr_bitmap);
- memset(va, 0xff, PAGE_SIZE);
- kunmap(vmx_msr_bitmap);
+ memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+ memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
- goto out2;
+ goto out3;
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
- vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
+ vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- if (vm_need_ept()) {
+ if (enable_ept) {
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK,
- VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+ VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
} else
kvm_disable_tdp();
@@ -3761,20 +3828,23 @@ static int __init vmx_init(void)
return 0;
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
out2:
- __free_page(vmx_msr_bitmap);
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
out1:
- __free_page(vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_b);
out:
- __free_page(vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_io_bitmap_a);
return r;
}
static void __exit vmx_exit(void)
{
- __free_page(vmx_msr_bitmap);
- __free_page(vmx_io_bitmap_b);
- __free_page(vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+ free_page((unsigned long)vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_a);
kvm_exit();
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e917e79..3d452901182 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
- { "request_nmi", VCPU_STAT(request_nmi_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
- { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+ if (is_present_pte(pdpte[i]) &&
+ (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
return;
}
@@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
- kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer |= vcpu->arch.shadow_efer & EFER_LMA;
vcpu->arch.shadow_efer = efer;
+
+ vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+ kvm_mmu_reset_context(vcpu);
}
void kvm_enable_efer_bits(u64 mask)
@@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
+ unsigned long this_tsc_khz;
if ((!vcpu->time_page))
return;
- if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
- kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+ if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+ kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = this_tsc_khz;
}
+ put_cpu_var(cpu_tsc_khz);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -701,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr)
return false;
}
+static bool valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ return valid_mtrr_type(data & 0xff);
+}
+
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
- if (!msr_mtrr_valid(msr))
+ if (!mtrr_valid(vcpu, msr, data))
return 1;
if (msr == MSR_MTRRdefType) {
@@ -893,6 +933,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
case MSR_VM_HSAVE_PA:
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ case MSR_K7_EVNTSEL0:
data = 0;
break;
case MSR_MTRRcap:
@@ -1024,6 +1067,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_SYNC_MMU:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_ASSIGN_DEV_IRQ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1072,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp,
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
- if (n < num_msrs_to_save)
+ if (n < msr_list.nmsrs)
goto out;
r = -EFAULT;
if (copy_to_user(user_msr_list->indices, &msrs_to_save,
num_msrs_to_save * sizeof(u32)))
goto out;
- if (copy_to_user(user_msr_list->indices
- + num_msrs_to_save * sizeof(u32),
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
goto out;
@@ -1241,41 +1284,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}
+#define F(x) bit(X86_FEATURE_##x)
+
static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
- const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
- bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
- bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
- bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
- bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
- bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
- bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
- bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
- bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
- bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
- const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
- bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
- bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
- bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
- bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
- bit(X86_FEATURE_PGE) |
- bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
- bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
- bit(X86_FEATURE_SYSCALL) |
- (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
- bit(X86_FEATURE_LM) |
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_lm = 0;
#endif
- bit(X86_FEATURE_FXSR_OPT) |
- bit(X86_FEATURE_MMXEXT) |
- bit(X86_FEATURE_3DNOWEXT) |
- bit(X86_FEATURE_3DNOW);
- const u32 kvm_supported_word3_x86_features =
- bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+
+ /* cpuid 1.edx */
+ const u32 kvm_supported_word0_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_supported_word1_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_supported_word4_x86_features =
+ F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved, XSAVE, OSXSAVE */;
+ /* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
- bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
- bit(X86_FEATURE_SVM);
+ F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+ 0 /* SKINIT */ | 0 /* WDT */;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -1288,7 +1343,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
- entry->ecx &= kvm_supported_word3_x86_features;
+ entry->ecx &= kvm_supported_word4_x86_features;
break;
/* function 2 entries are STATEFUL. That is, repeated cpuid commands
* may return different values. This forces us to get_cpu() before
@@ -1350,6 +1405,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
put_cpu();
}
+#undef F
+
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries)
{
@@ -1421,8 +1478,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -ENXIO;
vcpu_load(vcpu);
- set_bit(irq->irq, vcpu->arch.irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+ kvm_queue_interrupt(vcpu, irq->irq, false);
vcpu_put(vcpu);
@@ -1584,8 +1640,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
out:
- if (lapic)
- kfree(lapic);
+ kfree(lapic);
return r;
}
@@ -1606,10 +1661,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return -EINVAL;
down_write(&kvm->slots_lock);
+ spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+ spin_unlock(&kvm->mmu_lock);
up_write(&kvm->slots_lock);
return 0;
}
@@ -1785,7 +1842,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
+ spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
memslot = &kvm->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -2360,7 +2419,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
u16 error_code,
int emulation_type)
{
- int r;
+ int r, shadow_mask;
struct decode_cache *c;
kvm_clear_exception_queue(vcpu);
@@ -2408,7 +2467,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
}
}
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+ return EMULATE_DONE;
+ }
+
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+ if (r == 0)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
if (vcpu->arch.pio.string)
return EMULATE_DO_MMIO;
@@ -2761,7 +2829,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+ PT_DIRTY_MASK, PT64_NX_MASK, 0);
for_each_possible_cpu(cpu)
per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
@@ -3012,6 +3080,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
return best;
}
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+ return 36;
+}
+
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 function, index;
@@ -3048,10 +3126,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- return (!vcpu->arch.irq_summary &&
+ return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
kvm_run->request_interrupt_window &&
- vcpu->arch.interrupt_window_open &&
- (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+ kvm_arch_interrupt_allowed(vcpu));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3064,8 +3141,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
kvm_run->ready_for_interrupt_injection = 1;
else
kvm_run->ready_for_interrupt_injection =
- (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary == 0);
+ kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu);
}
static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3094,9 +3172,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
up_read(&vcpu->kvm->slots_lock);
}
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
+
+ /* try to reinject previous events if any */
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops->set_nmi(vcpu);
+ return;
+ }
+
+ if (vcpu->arch.interrupt.pending) {
+ kvm_x86_ops->set_irq(vcpu);
+ return;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.nmi_pending) {
+ if (kvm_x86_ops->nmi_allowed(vcpu)) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
+ }
+ } else if (kvm_cpu_has_interrupt(vcpu)) {
+ if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+ false);
+ kvm_x86_ops->set_irq(vcpu);
+ }
+ }
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
+ bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ kvm_run->request_interrupt_window;
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3128,9 +3260,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
- kvm_inject_pending_timer_irqs(vcpu);
-
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3138,6 +3267,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_disable();
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ smp_mb__after_clear_bit();
+
if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -3145,21 +3277,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
- vcpu->guest_mode = 1;
- /*
- * Make sure that guest_mode assignment won't happen after
- * testing the pending IRQ vector bitmap.
- */
- smp_wmb();
-
if (vcpu->arch.exception.pending)
__queue_exception(vcpu);
- else if (irqchip_in_kernel(vcpu->kvm))
- kvm_x86_ops->inject_pending_irq(vcpu);
else
- kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+ inject_pending_irq(vcpu, kvm_run);
- kvm_lapic_sync_to_vapic(vcpu);
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
up_read(&vcpu->kvm->slots_lock);
@@ -3193,7 +3325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_debugreg(vcpu->arch.host_dr6, 6);
set_debugreg(vcpu->arch.host_dr7, 7);
- vcpu->guest_mode = 0;
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
++vcpu->stat.exits;
@@ -3220,8 +3352,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
profile_hit(KVM_PROFILING, (void *)rip);
}
- if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
- vcpu->arch.exception.pending = false;
kvm_lapic_sync_from_vapic(vcpu);
@@ -3230,6 +3360,7 @@ out:
return r;
}
+
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -3256,29 +3387,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_vcpu_block(vcpu);
down_read(&vcpu->kvm->slots_lock);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
- if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ {
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
- if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
- r = -EINTR;
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ default:
+ r = -EINTR;
+ break;
+ }
+ }
}
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- }
- if (signal_pending(current)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- }
- if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- }
+ if (r <= 0)
+ break;
+
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
}
}
@@ -3442,7 +3586,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
struct descriptor_table dt;
- int pending_vec;
vcpu_load(vcpu);
@@ -3472,16 +3615,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->efer = vcpu->arch.shadow_efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm)) {
- memset(sregs->interrupt_bitmap, 0,
- sizeof sregs->interrupt_bitmap);
- pending_vec = kvm_x86_ops->get_irq(vcpu);
- if (pending_vec >= 0)
- set_bit(pending_vec,
- (unsigned long *)sregs->interrupt_bitmap);
- } else
- memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
- sizeof sregs->interrupt_bitmap);
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
vcpu_put(vcpu);
@@ -3688,7 +3826,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
- tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
}
static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3785,8 +3922,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
}
static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- u32 old_tss_base,
- struct desc_struct *nseg_desc)
+ u16 old_tss_sel, u32 old_tss_base,
+ struct desc_struct *nseg_desc)
{
struct tss_segment_16 tss_segment_16;
int ret = 0;
@@ -3805,6 +3942,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
&tss_segment_16, sizeof tss_segment_16))
goto out;
+ if (old_tss_sel != 0xffff) {
+ tss_segment_16.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_16.prev_task_link,
+ sizeof tss_segment_16.prev_task_link))
+ goto out;
+ }
+
if (load_state_from_tss16(vcpu, &tss_segment_16))
goto out;
@@ -3814,7 +3961,7 @@ out:
}
static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- u32 old_tss_base,
+ u16 old_tss_sel, u32 old_tss_base,
struct desc_struct *nseg_desc)
{
struct tss_segment_32 tss_segment_32;
@@ -3834,6 +3981,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
&tss_segment_32, sizeof tss_segment_32))
goto out;
+ if (old_tss_sel != 0xffff) {
+ tss_segment_32.prev_task_link = old_tss_sel;
+
+ if (kvm_write_guest(vcpu->kvm,
+ get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_32.prev_task_link,
+ sizeof tss_segment_32.prev_task_link))
+ goto out;
+ }
+
if (load_state_from_tss32(vcpu, &tss_segment_32))
goto out;
@@ -3887,14 +4044,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
}
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
- &nseg_desc);
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
else
- ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
- &nseg_desc);
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+ old_tss_base, &nseg_desc);
if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3920,7 +4085,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
int mmu_reset_needed = 0;
- int i, pending_vec, max_bits;
+ int pending_vec, max_bits;
struct descriptor_table dt;
vcpu_load(vcpu);
@@ -3934,7 +4099,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
- vcpu->arch.cr3 = sregs->cr3;
+
+ down_read(&vcpu->kvm->slots_lock);
+ if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
+ vcpu->arch.cr3 = sregs->cr3;
+ else
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ up_read(&vcpu->kvm->slots_lock);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -3956,25 +4127,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- if (!irqchip_in_kernel(vcpu->kvm)) {
- memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->arch.irq_pending);
- vcpu->arch.irq_summary = 0;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
- if (vcpu->arch.irq_pending[i])
- __set_bit(i, &vcpu->arch.irq_summary);
- } else {
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap,
- max_bits);
- /* Only pending external irq is handled here */
- if (pending_vec < max_bits) {
- kvm_x86_ops->set_irq(vcpu, pending_vec);
- pr_debug("Set back pending irq %d\n",
- pending_vec);
- }
- kvm_pic_clear_isr_ack(vcpu->kvm);
+ max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ if (irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -4308,7 +4468,6 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4411,12 +4570,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
}
}
+ spin_lock(&kvm->mmu_lock);
if (!kvm->arch.n_requested_mmu_pages) {
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
}
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
return 0;
@@ -4425,6 +4586,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
void kvm_arch_flush_shadow(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
+ kvm_reload_remote_mmus(kvm);
}
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4434,28 +4596,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
|| vcpu->arch.nmi_pending;
}
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
- printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
- int ipi_pcpu = vcpu->cpu;
- int cpu = get_cpu();
+ int me;
+ int cpu = vcpu->cpu;
if (waitqueue_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq);
++vcpu->stat.halt_wakeup;
}
- /*
- * We may be called synchronously with irqs disabled in guest mode,
- * So need not to call smp_call_function_single() in that case.
- */
- if (vcpu->guest_mode && vcpu->cpu != cpu)
- smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+
+ me = get_cpu();
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ smp_send_reschedule(cpu);
put_cpu();
}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a738..4c8e10af78e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
vcpu->arch.exception.pending = false;
}
-static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
{
vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.soft = soft;
vcpu->arch.interrupt.nr = vector;
}
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
vcpu->arch.interrupt.pending = false;
}
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+ vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+ return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d208..616de4628d6 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
#define SrcImm (5<<4) /* Immediate operand. */
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
#define SrcOne (7<<4) /* Implied '1' */
-#define SrcMask (7<<4)
+#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
+#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
-#define ModRM (1<<7)
+#define ModRM (1<<8)
/* Destination is only written; never read. */
-#define Mov (1<<8)
-#define BitOp (1<<9)
-#define MemAbs (1<<10) /* Memory operand is absolute displacement */
+#define Mov (1<<9)
+#define BitOp (1<<10)
+#define MemAbs (1<<11) /* Memory operand is absolute displacement */
#define String (1<<12) /* String instruction (rep capable) */
#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
@@ -76,6 +77,7 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
+#define Src2Imm16 (4<<29)
#define Src2Mask (7<<29)
enum {
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = {
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
/* 0x70 - 0x77 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
/* 0x78 - 0x7F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
/* 0x80 - 0x87 */
Group | Group1_80, Group | Group1_81,
Group | Group1_82, Group | Group1_83,
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = {
/* 0x90 - 0x97 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x98 - 0x9F */
- 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+ 0, 0, SrcImm | Src2Imm16, 0,
+ ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
/* 0xA0 - 0xA7 */
ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = {
0, ImplicitOps | Stack, 0, 0,
ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */
- 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
+ 0, 0, 0, ImplicitOps | Stack,
+ ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
/* 0xD0 - 0xD7 */
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */
0, 0, 0, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ ByteOp | SrcImmUByte, SrcImmUByte,
+ ByteOp | SrcImmUByte, SrcImmUByte,
/* 0xE8 - 0xEF */
- ImplicitOps | Stack, SrcImm | ImplicitOps,
- ImplicitOps, SrcImmByte | ImplicitOps,
+ SrcImm | Stack, SrcImm | ImplicitOps,
+ SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = {
/* 0x70 - 0x7F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x80 - 0x8F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+ SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xA0 - 0xA7 */
@@ -1044,10 +1046,14 @@ done_prefixes:
}
break;
case SrcImmByte:
+ case SrcImmUByte:
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = 1;
- c->src.val = insn_fetch(s8, 1, c->eip);
+ if ((c->d & SrcMask) == SrcImmByte)
+ c->src.val = insn_fetch(s8, 1, c->eip);
+ else
+ c->src.val = insn_fetch(u8, 1, c->eip);
break;
case SrcOne:
c->src.bytes = 1;
@@ -1072,6 +1078,12 @@ done_prefixes:
c->src2.bytes = 1;
c->src2.val = insn_fetch(u8, 1, c->eip);
break;
+ case Src2Imm16:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 2;
+ c->src2.val = insn_fetch(u16, 2, c->eip);
+ break;
case Src2One:
c->src2.bytes = 1;
c->src2.val = 1;
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
return 0;
}
+static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (!(int_shadow & mask))
+ ctxt->interruptibility = mask;
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
int io_dir_in;
int rc = 0;
+ ctxt->interruptibility = 0;
+
/* Shadow copy of register state. Committed on successful emulation.
* NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
* modify them.
@@ -1531,13 +1559,10 @@ special_insn:
return -1;
}
return 0;
- case 0x70 ... 0x7f: /* jcc (short) */ {
- int rel = insn_fetch(s8, 1, c->eip);
-
+ case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, rel);
+ jmp_rel(c, c->src.val);
break;
- }
case 0x80 ... 0x83: /* Grp1 */
switch (c->modrm_reg) {
case 0:
@@ -1609,6 +1634,9 @@ special_insn:
int err;
sel = c->src.val;
+ if (c->modrm_reg == VCPU_SREG_SS)
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
if (c->modrm_reg <= 5) {
type_bits = (c->modrm_reg == 1) ? 9 : 1;
err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1769,59 +1797,32 @@ special_insn:
break;
case 0xe4: /* inb */
case 0xe5: /* in */
- port = insn_fetch(u8, 1, c->eip);
+ port = c->src.val;
io_dir_in = 1;
goto do_io;
case 0xe6: /* outb */
case 0xe7: /* out */
- port = insn_fetch(u8, 1, c->eip);
+ port = c->src.val;
io_dir_in = 0;
goto do_io;
case 0xe8: /* call (near) */ {
- long int rel;
- switch (c->op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, c->eip);
- break;
- default:
- DPRINTF("Call: Invalid op_bytes\n");
- goto cannot_emulate;
- }
+ long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- c->op_bytes = c->ad_bytes;
emulate_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: /* jmp far */ {
- uint32_t eip;
- uint16_t sel;
-
- switch (c->op_bytes) {
- case 2:
- eip = insn_fetch(u16, 2, c->eip);
- break;
- case 4:
- eip = insn_fetch(u32, 4, c->eip);
- break;
- default:
- DPRINTF("jmp far: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- sel = insn_fetch(u16, 2, c->eip);
- if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
+ case 0xea: /* jmp far */
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
+ VCPU_SREG_CS) < 0) {
DPRINTF("jmp far: Failed to load CS descriptor\n");
goto cannot_emulate;
}
- c->eip = eip;
+ c->eip = c->src.val;
break;
- }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -1865,6 +1866,7 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfb: /* sti */
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -2039,28 +2041,11 @@ twobyte_insn:
if (!test_cc(c->b, ctxt->eflags))
c->dst.type = OP_NONE; /* no writeback */
break;
- case 0x80 ... 0x8f: /* jnz rel, etc*/ {
- long int rel;
-
- switch (c->op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, c->eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, c->eip);
- break;
- default:
- DPRINTF("jnz: Invalid op_bytes\n");
- goto cannot_emulate;
- }
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, rel);
+ jmp_rel(c, c->src.val);
c->dst.type = OP_NONE;
break;
- }
case 0xa3:
bt: /* bt */
c->dst.type = OP_NONE;
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d..38718041efc 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
bool "Lguest guest support"
select PARAVIRT
depends on X86_32
- depends on !X86_PAE
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 33a93b41739..d677fa9ca65 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
*
* So how does the kernel know it's a Guest? We'll see that later, but let's
* just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal. :*/
+ * "paravirt" structures with our Guest versions, then boot like normal.
+:*/
/*
* Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
*
* The Guest in our tale is a simple creature: identical to the Host but
* behaving in simplified but equivalent ways. In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code). :*/
+ * same kernel as the Host (or at least, built from the same source code).
+:*/
struct lguest_data lguest_data = {
.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,35 +87,41 @@ struct lguest_data lguest_data = {
.syscall_vec = SYSCALL_VECTOR,
};
-/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
+/*G:037
+ * async_hcall() is pretty simple: I'm quite proud of it really. We have a
* ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
* arguments, and a "hcall_status" word which is 0 if the call is ready to go,
* and 255 once the Host has finished with it.
*
* If we come around to a slot which hasn't been finished, then the table is
* full and we just make the hypercall directly. This has the nice side
* effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time! */
+ * which empties it for next time!
+ */
static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3)
+ unsigned long arg2, unsigned long arg3,
+ unsigned long arg4)
{
/* Note: This code assumes we're uniprocessor. */
static unsigned int next_call;
unsigned long flags;
- /* Disable interrupts if not already disabled: we don't want an
+ /*
+ * Disable interrupts if not already disabled: we don't want an
* interrupt handler making a hypercall while we're already doing
- * one! */
+ * one!
+ */
local_irq_save(flags);
if (lguest_data.hcall_status[next_call] != 0xFF) {
/* Table full, so do normal hcall which will flush table. */
- kvm_hypercall3(call, arg1, arg2, arg3);
+ kvm_hypercall4(call, arg1, arg2, arg3, arg4);
} else {
lguest_data.hcalls[next_call].arg0 = call;
lguest_data.hcalls[next_call].arg1 = arg1;
lguest_data.hcalls[next_call].arg2 = arg2;
lguest_data.hcalls[next_call].arg3 = arg3;
+ lguest_data.hcalls[next_call].arg4 = arg4;
/* Arguments must all be written before we mark it to go */
wmb();
lguest_data.hcall_status[next_call] = 0;
@@ -123,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
local_irq_restore(flags);
}
-/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
- * real optimization trick!
+/*G:035
+ * Notice the lazy_hcall() above, rather than hcall(). This is our first real
+ * optimization trick!
*
* When lazy_mode is set, it means we're allowed to defer all hypercalls and do
* them as a batch when lazy_mode is eventually turned off. Because hypercalls
@@ -134,16 +143,18 @@ static void async_hcall(unsigned long call, unsigned long arg1,
* lguest_leave_lazy_mode().
*
* So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing: */
+ * future processing:
+ */
static void lazy_hcall1(unsigned long call,
unsigned long arg1)
{
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall1(call, arg1);
else
- async_hcall(call, arg1, 0, 0);
+ async_hcall(call, arg1, 0, 0, 0);
}
+/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
static void lazy_hcall2(unsigned long call,
unsigned long arg1,
unsigned long arg2)
@@ -151,7 +162,7 @@ static void lazy_hcall2(unsigned long call,
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall2(call, arg1, arg2);
else
- async_hcall(call, arg1, arg2, 0);
+ async_hcall(call, arg1, arg2, 0, 0);
}
static void lazy_hcall3(unsigned long call,
@@ -162,18 +173,40 @@ static void lazy_hcall3(unsigned long call,
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall3(call, arg1, arg2, arg3);
else
- async_hcall(call, arg1, arg2, arg3);
+ async_hcall(call, arg1, arg2, arg3, 0);
}
-/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4)
+{
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+ kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+ else
+ async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
+/*G:036
+ * When lazy mode is turned off reset the per-cpu lazy mode variable and then
+ * issue the do-nothing hypercall to flush any stored calls.
+:*/
+static void lguest_leave_lazy_mmu_mode(void)
+{
+ kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_end_context_switch(next);
}
-/*G:033
+/*G:032
* After that diversion we return to our first native-instruction
* replacements: four functions for interrupt control.
*
@@ -186,49 +219,51 @@ static void lguest_leave_lazy_mode(void)
* check there before it tries to deliver an interrupt.
*/
-/* save_flags() is expected to return the processor state (ie. "flags"). The
+/*
+ * save_flags() is expected to return the processor state (ie. "flags"). The
* flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag. Our "save_flags()" just returns that. */
+ * about the interrupt flag. Our "save_flags()" just returns that.
+ */
static unsigned long save_fl(void)
{
return lguest_data.irq_enabled;
}
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
- lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
/* Interrupts go off... */
static void irq_disable(void)
{
lguest_data.irq_enabled = 0;
}
+
+/*
+ * Let's pause a moment. Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules. In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register. To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it.
+ */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
-/* Interrupts go on... */
-static void irq_enable(void)
-{
- lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
-/*:*/
-/*M:003 Note that we don't check for outstanding interrupts when we re-enable
- * them (or when we unmask an interrupt). This seems to work for the moment,
- * since interrupts are rare and we'll just get the interrupt on the next timer
- * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way
- * would be to put the "irq_enabled" field in a page by itself, and have the
- * Host write-protect it when an interrupt comes in when irqs are disabled.
- * There will then be a page fault as soon as interrupts are re-enabled.
+/*M:003
+ * We could be more efficient in our checking of outstanding interrupts, rather
+ * than using a branch. One way would be to put the "irq_enabled" field in a
+ * page by itself, and have the Host write-protect it when an interrupt comes
+ * in when irqs are disabled. There will then be a page fault as soon as
+ * interrupts are re-enabled.
*
* A better method is to implement soft interrupt disable generally for x86:
* instead of disabling interrupts, we set a flag. If an interrupt does come
* in, we then disable them for real. This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency. :*/
+ * a hypercall for interrupt control and not worry about efficiency.
+:*/
/*G:034
* The Interrupt Descriptor Table (IDT).
@@ -241,10 +276,12 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
static void lguest_write_idt_entry(gate_desc *dt,
int entrynum, const gate_desc *g)
{
- /* The gate_desc structure is 8 bytes long: we hand it to the Host in
+ /*
+ * The gate_desc structure is 8 bytes long: we hand it to the Host in
* two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
* around like this; typesafety wasn't a big concern in Linux's early
- * years. */
+ * years.
+ */
u32 *desc = (u32 *)g;
/* Keep the local copy up to date. */
native_write_idt_entry(dt, entrynum, g);
@@ -252,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
}
-/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
+/*
+ * Changing to a different IDT is very rare: we keep the IDT up-to-date every
* time it is written, so we can simply loop through all entries and tell the
- * Host about them. */
+ * Host about them.
+ */
static void lguest_load_idt(const struct desc_ptr *desc)
{
unsigned int i;
@@ -285,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
}
-/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
+/*
+ * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
* then tell the Host to reload the entire thing. This operation is so rare
- * that this naive implementation is reasonable. */
+ * that this naive implementation is reasonable.
+ */
static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
const void *desc, int type)
{
@@ -297,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
dt[entrynum].a, dt[entrynum].b);
}
-/* OK, I lied. There are three "thread local storage" GDT entries which change
+/*
+ * OK, I lied. There are three "thread local storage" GDT entries which change
* on every context switch (these three entries are how glibc implements
- * __thread variables). So we have a hypercall specifically for this case. */
+ * __thread variables). So we have a hypercall specifically for this case.
+ */
static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
{
- /* There's one problem which normal hardware doesn't have: the Host
+ /*
+ * There's one problem which normal hardware doesn't have: the Host
* can't handle us removing entries we're currently using. So we clear
- * the GS register here: if it's needed it'll be reloaded anyway. */
+ * the GS register here: if it's needed it'll be reloaded anyway.
+ */
lazy_load_gs(0);
lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
}
-/*G:038 That's enough excitement for now, back to ploughing through each of
- * the different pv_ops structures (we're about 1/3 of the way through).
+/*G:038
+ * That's enough excitement for now, back to ploughing through each of the
+ * different pv_ops structures (we're about 1/3 of the way through).
*
* This is the Local Descriptor Table, another weird Intel thingy. Linux only
* uses this for some strange applications like Wine. We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault. */
+ * here, so they'll get an informative and friendly Segmentation Fault.
+ */
static void lguest_set_ldt(const void *addr, unsigned entries)
{
}
-/* This loads a GDT entry into the "Task Register": that entry points to a
+/*
+ * This loads a GDT entry into the "Task Register": that entry points to a
* structure called the Task State Segment. Some comments scattered though the
* kernel code indicate that this used for task switching in ages past, along
* with blood sacrifice and astrology.
@@ -327,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
* Now there's nothing interesting in here that we don't get told elsewhere.
* But the native version uses the "ltr" instruction, which makes the Host
* complain to the Guest about a Segmentation Fault and it'll oops. So we
- * override the native version with a do-nothing version. */
+ * override the native version with a do-nothing version.
+ */
static void lguest_load_tr_desc(void)
{
}
-/* The "cpuid" instruction is a way of querying both the CPU identity
+/*
+ * The "cpuid" instruction is a way of querying both the CPU identity
* (manufacturer, model, etc) and its features. It was introduced before the
* Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
* As you might imagine, after a decade and a half this treatment, it is now a
* giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
*
* This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 4 languages. I am not making this up!
+ * has been translated into 5 languages. I am not making this up!
*
* We could get funky here and identify ourselves as "GenuineLguest", but
* instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -351,7 +401,8 @@ static void lguest_load_tr_desc(void)
* Replacing the cpuid so we can turn features off is great for the kernel, but
* anyone (including userspace) can just use the raw "cpuid" instruction and
* the Host won't even notice since it isn't privileged. So we try not to get
- * too worked up about it. */
+ * too worked up about it.
+ */
static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
@@ -359,33 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
native_cpuid(ax, bx, cx, dx);
switch (function) {
- case 1: /* Basic feature request. */
- /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
+ /*
+ * CPUID 0 gives the highest legal CPUID number (and the ID string).
+ * We futureproof our code a little by sticking to known CPUID values.
+ */
+ case 0:
+ if (*ax > 5)
+ *ax = 5;
+ break;
+
+ /*
+ * CPUID 1 is a basic feature request.
+ *
+ * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
+ * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
+ */
+ case 1:
*cx &= 0x00002201;
- /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
- *dx &= 0x07808111;
- /* The Host can do a nice optimization if it knows that the
+ *dx &= 0x07808151;
+ /*
+ * The Host can do a nice optimization if it knows that the
* kernel mappings (addresses above 0xC0000000 or whatever
* PAGE_OFFSET is set to) haven't changed. But Linux calls
* flush_tlb_user() for both user and kernel mappings unless
- * the Page Global Enable (PGE) feature bit is set. */
+ * the Page Global Enable (PGE) feature bit is set.
+ */
*dx |= 0x00002000;
- /* We also lie, and say we're family id 5. 6 or greater
+ /*
+ * We also lie, and say we're family id 5. 6 or greater
* leads to a rdmsr in early_init_intel which we can't handle.
- * Family ID is returned as bits 8-12 in ax. */
+ * Family ID is returned as bits 8-12 in ax.
+ */
*ax &= 0xFFFFF0FF;
*ax |= 0x00000500;
break;
+ /*
+ * 0x80000000 returns the highest Extended Function, so we futureproof
+ * like we do above by limiting it to known fields.
+ */
case 0x80000000:
- /* Futureproof this a little: if they ask how much extended
- * processor information there is, limit it to known fields. */
if (*ax > 0x80000008)
*ax = 0x80000008;
break;
+
+ /*
+ * PAE systems can mark pages as non-executable. Linux calls this the
+ * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
+ * Virus Protection). We just switch turn if off here, since we don't
+ * support it.
+ */
+ case 0x80000001:
+ *dx &= ~(1 << 20);
+ break;
}
}
-/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
+/*
+ * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
* I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
* it. The Host needs to know when the Guest wants to change them, so we have
* a whole series of functions like read_cr0() and write_cr0().
@@ -400,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
* name like "FPUTRAP bit" be a little less cryptic?
*
* We store cr0 locally because the Host never changes it. The Guest sometimes
- * wants to read it and we'd prefer not to bother the Host unnecessarily. */
+ * wants to read it and we'd prefer not to bother the Host unnecessarily.
+ */
static unsigned long current_cr0;
static void lguest_write_cr0(unsigned long val)
{
@@ -413,18 +495,22 @@ static unsigned long lguest_read_cr0(void)
return current_cr0;
}
-/* Intel provided a special instruction to clear the TS bit for people too cool
+/*
+ * Intel provided a special instruction to clear the TS bit for people too cool
* to use write_cr0() to do it. This "clts" instruction is faster, because all
- * the vowels have been optimized out. */
+ * the vowels have been optimized out.
+ */
static void lguest_clts(void)
{
lazy_hcall1(LHCALL_TS, 0);
current_cr0 &= ~X86_CR0_TS;
}
-/* cr2 is the virtual address of the last page fault, which the Guest only ever
+/*
+ * cr2 is the virtual address of the last page fault, which the Guest only ever
* reads. The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there. */
+ * just read it out of there.
+ */
static unsigned long lguest_read_cr2(void)
{
return lguest_data.cr2;
@@ -433,10 +519,12 @@ static unsigned long lguest_read_cr2(void)
/* See lguest_set_pte() below. */
static bool cr3_changed = false;
-/* cr3 is the current toplevel pagetable page: the principle is the same as
+/*
+ * cr3 is the current toplevel pagetable page: the principle is the same as
* cr0. Keep a local copy, and tell the Host when it changes. The only
* difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall. */
+ * to set it upon our initial hypercall.
+ */
static void lguest_write_cr3(unsigned long cr3)
{
lguest_data.pgdir = cr3;
@@ -481,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
* cr3 ---> +---------+
* | --------->+---------+
* | | | PADDR1 |
- * Top-level | | PADDR2 |
+ * Mid-level | | PADDR2 |
* (PMD) page | | |
* | | Lower-level |
* | | (PTE) page |
@@ -501,41 +589,113 @@ static void lguest_write_cr4(unsigned long val)
* Index into top Index into second Offset within page
* page directory page pagetable page
*
- * The kernel spends a lot of time changing both the top-level page directory
- * and lower-level pagetable pages. The Guest doesn't know physical addresses,
- * so while it maintains these page tables exactly like normal, it also needs
- * to keep the Host informed whenever it makes a change: the Host will create
- * the real page tables based on the Guests'.
+ * Now, unfortunately, this isn't the whole story: Intel added Physical Address
+ * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
+ * These are held in 64-bit page table entries, so we can now only fit 512
+ * entries in a page, and the neat three-level tree breaks down.
+ *
+ * The result is a four level page table:
+ *
+ * cr3 --> [ 4 Upper ]
+ * [ Level ]
+ * [ Entries ]
+ * [(PUD Page)]---> +---------+
+ * | --------->+---------+
+ * | | | PADDR1 |
+ * Mid-level | | PADDR2 |
+ * (PMD) page | | |
+ * | | Lower-level |
+ * | | (PTE) page |
+ * | | | |
+ * .... ....
+ *
+ *
+ * And the virtual address is decoded as:
+ *
+ * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
+ * Index into Index into mid Index into lower Offset within page
+ * top entries directory page pagetable page
+ *
+ * It's too hard to switch between these two formats at runtime, so Linux only
+ * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
+ * distributions turn it on, and not just for people with silly amounts of
+ * memory: the larger PTE entries allow room for the NX bit, which lets the
+ * kernel disable execution of pages and increase security.
+ *
+ * This was a problem for lguest, which couldn't run on these distributions;
+ * then Matias Zabaljauregui figured it all out and implemented it, and only a
+ * handful of puppies were crushed in the process!
+ *
+ * Back to our point: the kernel spends a lot of time changing both the
+ * top-level page directory and lower-level pagetable pages. The Guest doesn't
+ * know physical addresses, so while it maintains these page tables exactly
+ * like normal, it also needs to keep the Host informed whenever it makes a
+ * change: the Host will create the real page tables based on the Guests'.
*/
-/* The Guest calls this to set a second-level entry (pte), ie. to map a page
- * into a process' address space. We set the entry then tell the Host the
- * toplevel and address this corresponds to. The Guest uses one pagetable per
- * process, so we need to tell the Host which one we're changing (mm->pgd). */
+/*
+ * The Guest calls this after it has set a second-level entry (pte), ie. to map
+ * a page into a process' address space. Wetell the Host the toplevel and
+ * address this corresponds to. The Guest uses one pagetable per process, so
+ * we need to tell the Host which one we're changing (mm->pgd).
+ */
static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
+#ifdef CONFIG_X86_PAE
+ /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
+ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+ ptep->pte_low, ptep->pte_high);
+#else
lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
}
+/* This is the "set and update" combo-meal-deal version. */
static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- *ptep = pteval;
+ native_set_pte(ptep, pteval);
lguest_pte_update(mm, addr, ptep);
}
-/* The Guest calls this to set a top-level entry. Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/*
+ * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ *
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed.
+ */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+ native_set_pud(pudp, pudval);
+
+ /* 32 bytes aligned pdpt address and the index. */
+ lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+ (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
- *pmdp = pmdval;
+ native_set_pmd(pmdp, pmdval);
lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ native_set_pmd(pmdp, pmdval);
+ lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#endif
-/* There are a couple of legacy places where the kernel sets a PTE, but we
+/*
+ * There are a couple of legacy places where the kernel sets a PTE, but we
* don't know the top level any more. This is useless for us, since we don't
* know which pagetable is changing or what address, so we just tell the Host
* to forget all of them. Fortunately, this is very rare.
@@ -543,15 +703,43 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* ... except in early boot when the kernel sets up the initial pagetables,
* which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
* the Host anything changed until we've done the first page table switch,
- * which brings boot back to 0.25 seconds. */
+ * which brings boot back to 0.25 seconds.
+ */
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
- *ptep = pteval;
+ native_set_pte(ptep, pteval);
if (cr3_changed)
lazy_hcall1(LHCALL_FLUSH_TLB, 1);
}
-/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
+#ifdef CONFIG_X86_PAE
+/*
+ * With 64-bit PTE values, we need to be careful setting them: if we set 32
+ * bits at a time, the hardware could see a weird half-set entry. These
+ * versions ensure we update all 64 bits at once.
+ */
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte_atomic(ptep, pte);
+ if (cr3_changed)
+ lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ native_pte_clear(mm, addr, ptep);
+ lguest_pte_update(mm, addr, ptep);
+}
+
+static void lguest_pmd_clear(pmd_t *pmdp)
+{
+ lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
+/*
+ * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
* native page table operations. On native hardware you can set a new page
* table entry whenever you want, but if you want to remove one you have to do
* a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -560,24 +748,29 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
* called when a valid entry is written, not when it's removed (ie. marked not
* present). Instead, this is where we come when the Guest wants to remove a
* page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero). */
+ * bit is zero).
+ */
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
}
-/* This is what happens after the Guest has removed a large number of entries.
+/*
+ * This is what happens after the Guest has removed a large number of entries.
* This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET. */
+ * have changed, ie. virtual addresses below PAGE_OFFSET.
+ */
static void lguest_flush_tlb_user(void)
{
lazy_hcall1(LHCALL_FLUSH_TLB, 0);
}
-/* This is called when the kernel page tables have changed. That's not very
+/*
+ * This is called when the kernel page tables have changed. That's not very
* common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above. */
+ * slow), so it's worth separating this from the user flushing above.
+ */
static void lguest_flush_tlb_kernel(void)
{
lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -614,30 +807,41 @@ static struct irq_chip lguest_irq_controller = {
.unmask = enable_lguest_irq,
};
-/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
+/*
+ * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
* interrupt (except 128, which is used for system calls), and then tells the
* Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller. */
+ * lguest interrupt controller.
+ */
static void __init lguest_init_IRQ(void)
{
unsigned int i;
- for (i = 0; i < LGUEST_IRQS; i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- /* Some systems map "vectors" to interrupts weirdly. Lguest has
- * a straightforward 1 to 1 mapping, so force that here. */
- __get_cpu_var(vector_irq)[vector] = i;
- if (vector != SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ /* Some systems map "vectors" to interrupts weirdly. Not us! */
+ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ if (i != SYSCALL_VECTOR)
+ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
- /* This call is required to set up for 4k stacks, where we have
- * separate stacks for hard and soft interrupts. */
+
+ /*
+ * This call is required to set up for 4k stacks, where we have
+ * separate stacks for hard and soft interrupts.
+ */
irq_ctx_init(smp_processor_id());
}
+/*
+ * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
+ * rather than set them in lguest_init_IRQ we are called here every time an
+ * lguest device needs an interrupt.
+ *
+ * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
+ * pass that up!
+ */
void lguest_setup_irq(unsigned int irq)
{
- irq_to_desc_alloc_cpu(irq, 0);
+ irq_to_desc_alloc_node(irq, 0);
set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
}
@@ -653,31 +857,39 @@ static unsigned long lguest_get_wallclock(void)
return lguest_data.time.tv_sec;
}
-/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
+/*
+ * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
* what speed it runs at, or 0 if it's unusable as a reliable clock source.
* This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself. */
+ * TSC clock will give up and not register itself.
+ */
static unsigned long lguest_tsc_khz(void)
{
return lguest_data.tsc_khz;
}
-/* If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host. */
+/*
+ * If we can't use the TSC, the kernel falls back to our lower-priority
+ * "lguest_clock", where we read the time value given to us by the Host.
+ */
static cycle_t lguest_clock_read(struct clocksource *cs)
{
unsigned long sec, nsec;
- /* Since the time is in two parts (seconds and nanoseconds), we risk
+ /*
+ * Since the time is in two parts (seconds and nanoseconds), we risk
* reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
* and getting 99 and 0. As Linux tends to come apart under the stress
- * of time travel, we must be careful: */
+ * of time travel, we must be careful:
+ */
do {
/* First we read the seconds part. */
sec = lguest_data.time.tv_sec;
- /* This read memory barrier tells the compiler and the CPU that
+ /*
+ * This read memory barrier tells the compiler and the CPU that
* this can't be reordered: we have to complete the above
- * before going on. */
+ * before going on.
+ */
rmb();
/* Now we read the nanoseconds part. */
nsec = lguest_data.time.tv_nsec;
@@ -701,9 +913,11 @@ static struct clocksource lguest_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-/* We also need a "struct clock_event_device": Linux asks us to set it to go
+/*
+ * We also need a "struct clock_event_device": Linux asks us to set it to go
* off some time in the future. Actually, James Morris figured all this out, I
- * just applied the patch. */
+ * just applied the patch.
+ */
static int lguest_clockevent_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
@@ -753,8 +967,10 @@ static struct clock_event_device lguest_clockevent = {
.max_delta_ns = LG_CLOCK_MAX_DELTA,
};
-/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
- * call the clockevent infrastructure and it does whatever needs doing. */
+/*
+ * This is the Guest timer interrupt handler (hardware interrupt 0). We just
+ * call the clockevent infrastructure and it does whatever needs doing.
+ */
static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
{
unsigned long flags;
@@ -765,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
local_irq_restore(flags);
}
-/* At some point in the boot process, we get asked to set up our timing
+/*
+ * At some point in the boot process, we get asked to set up our timing
* infrastructure. The kernel doesn't expect timer interrupts before this, but
* we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now. */
+ * lguest_data" so that timer interrupts were blocked until now.
+ */
static void lguest_time_init(void)
{
/* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -792,14 +1010,16 @@ static void lguest_time_init(void)
* to work. They're pretty simple.
*/
-/* The Guest needs to tell the Host what stack it expects traps to use. For
+/*
+ * The Guest needs to tell the Host what stack it expects traps to use. For
* native hardware, this is part of the Task State Segment mentioned above in
* lguest_load_tr_desc(), but to help hypervisors there's this special call.
*
* We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
* segment), the privilege level (we're privilege level 1, the Host is 0 and
* will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack. */
+ * of pages in the stack.
+ */
static void lguest_load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -813,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
/* FIXME: Implement */
}
-/* There are times when the kernel wants to make sure that no memory writes are
+/*
+ * There are times when the kernel wants to make sure that no memory writes are
* caught in the cache (that they've all reached real hardware devices). This
* doesn't matter for the Guest which has virtual hardware.
*
@@ -827,11 +1048,13 @@ static void lguest_wbinvd(void)
{
}
-/* If the Guest expects to have an Advanced Programmable Interrupt Controller,
+/*
+ * If the Guest expects to have an Advanced Programmable Interrupt Controller,
* we play dumb by ignoring writes and returning 0 for reads. So it's no
* longer Programmable nor Controlling anything, and I don't think 8 lines of
* code qualifies for Advanced. It will also never interrupt anything. It
- * does, however, allow us to get through the Linux boot code. */
+ * does, however, allow us to get through the Linux boot code.
+ */
#ifdef CONFIG_X86_LOCAL_APIC
static void lguest_apic_write(u32 reg, u32 v)
{
@@ -880,11 +1103,13 @@ static void lguest_safe_halt(void)
kvm_hypercall0(LHCALL_HALT);
}
-/* The SHUTDOWN hypercall takes a string to describe what's happening, and
+/*
+ * The SHUTDOWN hypercall takes a string to describe what's happening, and
* an argument which says whether this to restart (reboot) the Guest or not.
*
* Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here. */
+ * rather than virtual addresses, so we use __pa() here.
+ */
static void lguest_power_off(void)
{
kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -915,8 +1140,10 @@ static __init char *lguest_memory_setup(void)
* nice to move it back to lguest_init. Patch welcome... */
atomic_notifier_chain_register(&panic_notifier_list, &paniced);
- /* The Linux bootloader header contains an "e820" memory map: the
- * Launcher populated the first entry with our memory limit. */
+ /*
+ *The Linux bootloader header contains an "e820" memory map: the
+ * Launcher populated the first entry with our memory limit.
+ */
e820_add_region(boot_params.e820_map[0].addr,
boot_params.e820_map[0].size,
boot_params.e820_map[0].type);
@@ -925,16 +1152,17 @@ static __init char *lguest_memory_setup(void)
return "LGUEST";
}
-/* We will eventually use the virtio console device to produce console output,
+/*
+ * We will eventually use the virtio console device to produce console output,
* but before that is set up we use LHCALL_NOTIFY on normal memory to produce
- * console output. */
+ * console output.
+ */
static __init int early_put_chars(u32 vtermno, const char *buf, int count)
{
char scratch[17];
unsigned int len = count;
- /* We use a nul-terminated string, so we have to make a copy. Icky,
- * huh? */
+ /* We use a nul-terminated string, so we make a copy. Icky, huh? */
if (len > sizeof(scratch) - 1)
len = sizeof(scratch) - 1;
scratch[len] = '\0';
@@ -945,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
return len;
}
-/* Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us. */
+/*
+ * Rebooting also tells the Host we're finished, but the RESTART flag tells the
+ * Launcher to reboot us.
+ */
static void lguest_restart(char *reason)
{
kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -967,13 +1197,14 @@ static void lguest_restart(char *reason)
*
* Our current solution is to allow the paravirt back end to optionally patch
* over the indirect calls to replace them with something more efficient. We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
*
* First we need assembly templates of each of the patchable Guest operations,
- * and these are in i386_head.S. */
+ * and these are in i386_head.S.
+ */
/*G:060 We construct a table from the assembler templates: */
static const struct lguest_insns
@@ -981,14 +1212,14 @@ static const struct lguest_insns
const char *start, *end;
} lguest_insns[] = {
[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
- [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
};
-/* Now our patch routine is fairly simple (based on the native one in
+/*
+ * Now our patch routine is fairly simple (based on the native one in
* paravirt.c). If we have a replacement, we copy it in and return how much of
- * the available space we used. */
+ * the available space we used.
+ */
static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
unsigned long addr, unsigned len)
{
@@ -1000,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
insn_len = lguest_insns[type].end - lguest_insns[type].start;
- /* Similarly if we can't fit replacement (shouldn't happen, but let's
- * be thorough). */
+ /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
if (len < insn_len)
return paravirt_patch_default(type, clobber, ibuf, addr, len);
@@ -1010,33 +1240,40 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
return insn_len;
}
-/*G:030 Once we get to lguest_init(), we know we're a Guest. The various
+/*G:029
+ * Once we get to lguest_init(), we know we're a Guest. The various
* pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions. */
+ * have to override to avoid privileged instructions.
+ */
__init void lguest_init(void)
{
- /* We're under lguest, paravirt is enabled, and we're running at
- * privilege level 1, not 0 as normal. */
+ /* We're under lguest. */
pv_info.name = "lguest";
+ /* Paravirt is enabled. */
pv_info.paravirt_enabled = 1;
+ /* We're running at privilege level 1, not 0 as normal. */
pv_info.kernel_rpl = 1;
+ /* Everyone except Xen runs with this set. */
+ pv_info.shared_kernel_pmd = 1;
- /* We set up all the lguest overrides for sensitive operations. These
- * are detailed with the operations themselves. */
+ /*
+ * We set up all the lguest overrides for sensitive operations. These
+ * are detailed with the operations themselves.
+ */
- /* interrupt-related operations */
+ /* Interrupt-related operations */
pv_irq_ops.init_IRQ = lguest_init_IRQ;
pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
- pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
- pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
pv_irq_ops.safe_halt = lguest_safe_halt;
- /* init-time operations */
+ /* Setup operations */
pv_init_ops.memory_setup = lguest_memory_setup;
pv_init_ops.patch = lguest_patch;
- /* Intercepts of various cpu instructions */
+ /* Intercepts of various CPU instructions */
pv_cpu_ops.load_gdt = lguest_load_gdt;
pv_cpu_ops.cpuid = lguest_cpuid;
pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1054,10 +1291,10 @@ __init void lguest_init(void)
pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
- pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+ pv_cpu_ops.end_context_switch = lguest_end_context_switch;
- /* pagetable management */
+ /* Pagetable management */
pv_mmu_ops.write_cr3 = lguest_write_cr3;
pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1065,62 +1302,85 @@ __init void lguest_init(void)
pv_mmu_ops.set_pte = lguest_set_pte;
pv_mmu_ops.set_pte_at = lguest_set_pte_at;
pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+ pv_mmu_ops.pte_clear = lguest_pte_clear;
+ pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+ pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
pv_mmu_ops.read_cr2 = lguest_read_cr2;
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
pv_mmu_ops.pte_update = lguest_pte_update;
pv_mmu_ops.pte_update_defer = lguest_pte_update;
#ifdef CONFIG_X86_LOCAL_APIC
- /* apic read/write intercepts */
+ /* APIC read/write intercepts */
set_lguest_basic_apic_ops();
#endif
- /* time operations */
+ /* Time operations */
pv_time_ops.get_wallclock = lguest_get_wallclock;
pv_time_ops.time_init = lguest_time_init;
pv_time_ops.get_tsc_khz = lguest_tsc_khz;
- /* Now is a good time to look at the implementations of these functions
- * before returning to the rest of lguest_init(). */
+ /*
+ * Now is a good time to look at the implementations of these functions
+ * before returning to the rest of lguest_init().
+ */
- /*G:070 Now we've seen all the paravirt_ops, we return to
+ /*G:070
+ * Now we've seen all the paravirt_ops, we return to
* lguest_init() where the rest of the fairly chaotic boot setup
- * occurs. */
+ * occurs.
+ */
- /* The stack protector is a weird thing where gcc places a canary
+ /*
+ * The stack protector is a weird thing where gcc places a canary
* value on the stack and then checks it on return. This file is
* compiled with -fno-stack-protector it, so we got this far without
* problems. The value of the canary is kept at offset 20 from the
* %gs register, so we need to set that up before calling C functions
- * in other files. */
+ * in other files.
+ */
setup_stack_canary_segment(0);
- /* We could just call load_stack_canary_segment(), but we might as
- * call switch_to_new_gdt() which loads the whole table and sets up
- * the per-cpu segment descriptor register %fs as well. */
+
+ /*
+ * We could just call load_stack_canary_segment(), but we might as well
+ * call switch_to_new_gdt() which loads the whole table and sets up the
+ * per-cpu segment descriptor register %fs as well.
+ */
switch_to_new_gdt(0);
- /* As described in head_32.S, we map the first 128M of memory. */
+ /* We actually boot with all memory mapped, but let's say 128MB. */
max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
- /* The Host<->Guest Switcher lives at the top of our address space, and
+ /*
+ * The Host<->Guest Switcher lives at the top of our address space, and
* the Host told us how big it is when we made LGUEST_INIT hypercall:
- * it put the answer in lguest_data.reserve_mem */
+ * it put the answer in lguest_data.reserve_mem
+ */
reserve_top_address(lguest_data.reserve_mem);
- /* If we don't initialize the lock dependency checker now, it crashes
- * paravirt_disable_iospace. */
+ /*
+ * If we don't initialize the lock dependency checker now, it crashes
+ * paravirt_disable_iospace.
+ */
lockdep_init();
- /* The IDE code spends about 3 seconds probing for disks: if we reserve
+ /*
+ * The IDE code spends about 3 seconds probing for disks: if we reserve
* all the I/O ports up front it can't get them and so doesn't probe.
* Other device drivers are similar (but less severe). This cuts the
- * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
+ * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
+ */
paravirt_disable_iospace();
- /* This is messy CPU setup stuff which the native boot code does before
- * start_kernel, so we have to do, too: */
+ /*
+ * This is messy CPU setup stuff which the native boot code does before
+ * start_kernel, so we have to do, too:
+ */
cpu_detect(&new_cpu_data);
/* head.S usually sets up the first capability word, so do it here. */
new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1137,22 +1397,28 @@ __init void lguest_init(void)
acpi_ht = 0;
#endif
- /* We set the preferred console to "hvc". This is the "hypervisor
+ /*
+ * We set the preferred console to "hvc". This is the "hypervisor
* virtual console" driver written by the PowerPC people, which we also
- * adapted for lguest's use. */
+ * adapted for lguest's use.
+ */
add_preferred_console("hvc", 0, NULL);
/* Register our very early console. */
virtio_cons_early_init(early_put_chars);
- /* Last of all, we set the power management poweroff hook to point to
+ /*
+ * Last of all, we set the power management poweroff hook to point to
* the Guest routine to power off, and the reboot hook to our restart
- * routine. */
+ * routine.
+ */
pm_power_off = lguest_power_off;
machine_ops.restart = lguest_restart;
- /* Now we're set up, call i386_start_kernel() in head32.c and we proceed
- * to boot as normal. It never returns. */
+ /*
+ * Now we're set up, call i386_start_kernel() in head32.c and we proceed
+ * to boot as normal. It never returns.
+ */
i386_start_kernel();
}
/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f7954198947..27eac0faee4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
-/*G:020 Our story starts with the kernel booting into startup_32 in
+/*G:020
+ * Our story starts with the kernel booting into startup_32 in
* arch/x86/kernel/head_32.S. It expects a boot header, which is created by
* the bootloader (the Launcher in our case).
*
@@ -21,11 +22,14 @@
* data without remembering to subtract __PAGE_OFFSET!
*
* The .section line puts this code in .init.text so it will be discarded after
- * boot. */
+ * boot.
+ */
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
- /* We make the "initialization" hypercall now to tell the Host about
- * us, and also find out where it put our page tables. */
+ /*
+ * We make the "initialization" hypercall now to tell the Host about
+ * us, and also find out where it put our page tables.
+ */
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,44 +37,119 @@ ENTRY(lguest_entry)
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
- /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
- * moment. */
+ /* Jumps are relative: we're running __PAGE_OFFSET too low. */
jmp lguest_init+__PAGE_OFFSET
-/*G:055 We create a macro which puts the assembler code between lgstart_ and
- * lgend_ markers. These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too. */
+/*G:055
+ * We create a macro which puts the assembler code between lgstart_ and lgend_
+ * markers. These templates are put in the .text section: they can't be
+ * discarded after boot as we may need to patch modules, too.
+ */
.text
#define LGUEST_PATCH(name, insns...) \
lgstart_##name: insns; lgend_##name:; \
.globl lgstart_##name; .globl lgend_##name
LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
+
+/*G:033
+ * But using those wrappers is inefficient (we'll see why that doesn't matter
+ * for save_fl and irq_disable later). If we write our routines carefully in
+ * assembler, we can avoid clobbering any registers and avoid jumping through
+ * the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages. First, the routine to
+ * enable interrupts:
+ */
+ENTRY(lg_irq_enable)
+ /*
+ * The reverse of irq_disable, this sets lguest_data.irq_enabled to
+ * X86_EFLAGS_IF (ie. "Interrupts enabled").
+ */
+ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+ /*
+ * But now we need to check if the Host wants to know: there might have
+ * been interrupts waiting to be delivered, in which case it will have
+ * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
+ * jump to send_interrupts, otherwise we're done.
+ */
+ testl $0, lguest_data+LGUEST_DATA_irq_pending
+ jnz send_interrupts
+ /*
+ * One cool thing about x86 is that you can do many things without using
+ * a register. In this case, the normal path hasn't needed to save or
+ * restore any registers at all!
+ */
+ ret
+send_interrupts:
+ /*
+ * OK, now we need a register: eax is used for the hypercall number,
+ * which is LHCALL_SEND_INTERRUPTS.
+ *
+ * We used not to bother with this pending detection at all, which was
+ * much simpler. Sooner or later the Host would realize it had to
+ * send us an interrupt. But that turns out to make performance 7
+ * times worse on a simple tcp benchmark. So now we do this the hard
+ * way.
+ */
+ pushl %eax
+ movl $LHCALL_SEND_INTERRUPTS, %eax
+ /*
+ * This is a vmcall instruction (same thing that KVM uses). Older
+ * assembler versions might not know the "vmcall" instruction, so we
+ * create one manually here.
+ */
+ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ /* Put eax back the way we found it. */
+ popl %eax
+ ret
+
+/*
+ * Finally, the "popf" or "restore flags" routine. The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off.
+ */
+ENTRY(lg_restore_fl)
+ /* This is just "lguest_data.irq_enabled = flags;" */
+ movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+ /*
+ * Now, if the %eax value has enabled interrupts and
+ * lguest_data.irq_pending is set, we want to tell the Host so it can
+ * deliver any outstanding interrupts. Fortunately, both values will
+ * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+ * instruction will AND them together for us. If both are set, we
+ * jump to send_interrupts.
+ */
+ testl lguest_data+LGUEST_DATA_irq_pending, %eax
+ jnz send_interrupts
+ /* Again, the normal path has used no extra registers. Clever, huh? */
+ ret
/*:*/
/* These demark the EIP range where host should never deliver interrupts. */
.global lguest_noirq_start
.global lguest_noirq_end
-/*M:004 When the Host reflects a trap or injects an interrupt into the Guest,
- * it sets the eflags interrupt bit on the stack based on
- * lguest_data.irq_enabled, so the Guest iret logic does the right thing when
- * restoring it. However, when the Host sets the Guest up for direct traps,
- * such as system calls, the processor is the one to push eflags onto the
- * stack, and the interrupt bit will be 1 (in reality, interrupts are always
- * enabled in the Guest).
+/*M:004
+ * When the Host reflects a trap or injects an interrupt into the Guest, it
+ * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
+ * so the Guest iret logic does the right thing when restoring it. However,
+ * when the Host sets the Guest up for direct traps, such as system calls, the
+ * processor is the one to push eflags onto the stack, and the interrupt bit
+ * will be 1 (in reality, interrupts are always enabled in the Guest).
*
* This turns out to be harmless: the only trap which should happen under Linux
* with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
* regions), which has to be reflected through the Host anyway. If another
* trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret! :*/
+ * we'll never get to this iret!
+:*/
-/*G:045 There is one final paravirt_op that the Guest implements, and glancing
- * at it you can see why I left it to last. It's *cool*! It's in *assembler*!
+/*G:045
+ * There is one final paravirt_op that the Guest implements, and glancing at it
+ * you can see why I left it to last. It's *cool*! It's in *assembler*!
*
* The "iret" instruction is used to return from an interrupt or trap. The
* stack looks like this:
@@ -94,15 +173,18 @@ LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
* return to userspace or wherever. Our solution to this is to surround the
* code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
* Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. */
+ * enabled.
+ */
ENTRY(lguest_iret)
pushl %eax
movl 12(%esp), %eax
lguest_noirq_start:
- /* Note the %ss: segment prefix here. Normal data accesses use the
+ /*
+ * Note the %ss: segment prefix here. Normal data accesses use the
* "ds" segment, but that will have already been restored for whatever
* we're returning to (such as userspace): we can't trust it. The %ss:
- * prefix makes sure we use the stack segment, which is still valid. */
+ * prefix makes sure we use the stack segment, which is still valid.
+ */
movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
popl %eax
iret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66..07c31899c9c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
# Makefile for x86 specific library files.
#
-obj-$(CONFIG_SMP) := msr-on-cpu.o
+obj-$(CONFIG_SMP) := msr.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
@@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
ifeq ($(CONFIG_X86_32),y)
+ obj-y += atomic64_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 00000000000..824fa0be55a
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,230 @@
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/cmpxchg.h>
+#include <asm/atomic.h>
+
+static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
+{
+ u32 low = new;
+ u32 high = new >> 32;
+
+ asm volatile(
+ LOCK_PREFIX "cmpxchg8b %1\n"
+ : "+A" (old), "+m" (*ptr)
+ : "b" (low), "c" (high)
+ );
+ return old;
+}
+
+u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
+{
+ return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+EXPORT_SYMBOL(atomic64_cmpxchg);
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
+{
+ /*
+ * Try first with a (possibly incorrect) assumption about
+ * what we have there. We'll do two loops most likely,
+ * but we'll get an ownership MESI transaction straight away
+ * instead of a read transaction followed by a
+ * flush-for-ownership transaction:
+ */
+ u64 old_val, real_val = 0;
+
+ do {
+ old_val = real_val;
+
+ real_val = atomic64_cmpxchg(ptr, old_val, new_val);
+
+ } while (real_val != old_val);
+
+ return old_val;
+}
+EXPORT_SYMBOL(atomic64_xchg);
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+void atomic64_set(atomic64_t *ptr, u64 new_val)
+{
+ atomic64_xchg(ptr, new_val);
+}
+EXPORT_SYMBOL(atomic64_set);
+
+/**
+EXPORT_SYMBOL(atomic64_read);
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
+{
+ /*
+ * Try first with a (possibly incorrect) assumption about
+ * what we have there. We'll do two loops most likely,
+ * but we'll get an ownership MESI transaction straight away
+ * instead of a read transaction followed by a
+ * flush-for-ownership transaction:
+ */
+ u64 old_val, new_val, real_val = 0;
+
+ do {
+ old_val = real_val;
+ new_val = old_val + delta;
+
+ real_val = atomic64_cmpxchg(ptr, old_val, new_val);
+
+ } while (real_val != old_val);
+
+ return new_val;
+}
+EXPORT_SYMBOL(atomic64_add_return);
+
+u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
+{
+ return atomic64_add_return(-delta, ptr);
+}
+EXPORT_SYMBOL(atomic64_sub_return);
+
+u64 atomic64_inc_return(atomic64_t *ptr)
+{
+ return atomic64_add_return(1, ptr);
+}
+EXPORT_SYMBOL(atomic64_inc_return);
+
+u64 atomic64_dec_return(atomic64_t *ptr)
+{
+ return atomic64_sub_return(1, ptr);
+}
+EXPORT_SYMBOL(atomic64_dec_return);
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+void atomic64_add(u64 delta, atomic64_t *ptr)
+{
+ atomic64_add_return(delta, ptr);
+}
+EXPORT_SYMBOL(atomic64_add);
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+void atomic64_sub(u64 delta, atomic64_t *ptr)
+{
+ atomic64_add(-delta, ptr);
+}
+EXPORT_SYMBOL(atomic64_sub);
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
+{
+ u64 new_val = atomic64_sub_return(delta, ptr);
+
+ return new_val == 0;
+}
+EXPORT_SYMBOL(atomic64_sub_and_test);
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+void atomic64_inc(atomic64_t *ptr)
+{
+ atomic64_add(1, ptr);
+}
+EXPORT_SYMBOL(atomic64_inc);
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+void atomic64_dec(atomic64_t *ptr)
+{
+ atomic64_sub(1, ptr);
+}
+EXPORT_SYMBOL(atomic64_dec);
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+int atomic64_dec_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(1, ptr);
+}
+EXPORT_SYMBOL(atomic64_dec_and_test);
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+int atomic64_inc_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(-1, ptr);
+}
+EXPORT_SYMBOL(atomic64_inc_and_test);
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+int atomic64_add_negative(u64 delta, atomic64_t *ptr)
+{
+ s64 new_val = atomic64_add_return(delta, ptr);
+
+ return new_val < 0;
+}
+EXPORT_SYMBOL(atomic64_add_negative);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 9a10a78bb4a..ebeafcce04a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -5,15 +5,14 @@
* Zero a page.
* rdi page
*/
- ALIGN
-clear_page_c:
+ENTRY(clear_page_c)
CFI_STARTPROC
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
CFI_ENDPROC
-ENDPROC(clear_page)
+ENDPROC(clear_page_c)
ENTRY(clear_page)
CFI_STARTPROC
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index f118c110af3..6ba0f7bb85e 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -75,6 +75,7 @@ ENTRY(copy_to_user)
jae bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
CFI_ENDPROC
+ENDPROC(copy_to_user)
/* Standard copy_from_user with segment limit checking */
ENTRY(copy_from_user)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index f4568605d7d..ff485d36118 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops)
preempt_disable();
cpu = smp_processor_id();
+ rdtsc_barrier();
rdtscl(bclock);
for (;;) {
+ rdtsc_barrier();
rdtscl(now);
if ((now - bclock) >= loops)
break;
@@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops)
if (unlikely(cpu != smp_processor_id())) {
loops -= (now - bclock);
cpu = smp_processor_id();
+ rdtsc_barrier();
rdtscl(bclock);
}
}
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf720dbb..00000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
-#include <asm/msr.h>
-
-struct msr_info {
- u32 msr_no;
- u32 l, h;
- int err;
-};
-
-static void __rdmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rdmsr(rv->msr_no, rv->l, rv->h);
-}
-
-static void __wrmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- wrmsr(rv->msr_no, rv->l, rv->h);
-}
-
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
- *l = rv.l;
- *h = rv.h;
-
- return err;
-}
-
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
-
- return err;
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
-static void __rdmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
-}
-
-static void __wrmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
-}
-
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
- *l = rv.l;
- *h = rv.h;
-
- return err ? err : rv.err;
-}
-
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- int err;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
-
- return err ? err : rv.err;
-}
-
-EXPORT_SYMBOL(rdmsr_on_cpu);
-EXPORT_SYMBOL(wrmsr_on_cpu);
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
-EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 00000000000..caa24aca811
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,177 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+struct msr_info {
+ u32 msr_no;
+ struct msr reg;
+ struct msr *msrs;
+ int off;
+ int err;
+};
+
+static void __rdmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = &rv->msrs[this_cpu - rv->off];
+ else
+ reg = &rv->reg;
+
+ rdmsr(rv->msr_no, reg->l, reg->h);
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = &rv->msrs[this_cpu - rv->off];
+ else
+ reg = &rv->reg;
+
+ wrmsr(rv->msr_no, reg->l, reg->h);
+}
+
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsr_on_cpu);
+
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.off = cpumask_first(mask);
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ this_cpu = get_cpu();
+
+ if (cpumask_test_cpu(this_cpu, mask))
+ __rdmsr_on_cpu(&rv);
+
+ smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
+ put_cpu();
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.off = cpumask_first(mask);
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ this_cpu = get_cpu();
+
+ if (cpumask_test_cpu(this_cpu, mask))
+ __wrmsr_on_cpu(&rv);
+
+ smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
+ put_cpu();
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7c8ca91bb9e..1f118d462ac 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -751,7 +751,7 @@ survive:
if (retval == -ENOMEM && is_global_init(current)) {
up_read(&current->mm->mmap_sem);
- congestion_wait(WRITE, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
goto survive;
}
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index ec13cb5f17e..b7c2849ffb6 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(__strnlen_user);
long strnlen_user(const char __user *s, long n)
{
- if (!access_ok(VERIFY_READ, s, n))
+ if (!access_ok(VERIFY_READ, s, 1))
return 0;
return __strnlen_user(s, n);
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab5..eefdeee8a87 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck/
+
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb4..a725b7f760a 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
+ int width = sizeof(unsigned long) * 2;
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%p-0x%p ",
- (void *)st->start_address,
- (void *)st->current_address);
+ seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa..bfae139182f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,18 @@
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
-#include <linux/interrupt.h>
-#include <linux/mmiotrace.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/highmem.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/vt_kern.h>
-#include <linux/signal.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/errno.h>
-#include <linux/magic.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/tty.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm-generic/sections.h>
-
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/segment.h>
-#include <asm/system.h>
-#include <asm/proto.h>
-#include <asm/traps.h>
-#include <asm/desc.h>
+#include <linux/magic.h> /* STACK_END_MAGIC */
+#include <linux/sched.h> /* test_thread_flag(), ... */
+#include <linux/kdebug.h> /* oops_begin/end, ... */
+#include <linux/module.h> /* search_exception_table */
+#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/mmiotrace.h> /* kmmio_handler, ... */
+#include <linux/perf_counter.h> /* perf_swcounter_event */
+
+#include <asm/traps.h> /* dotraplinkage, ... */
+#include <asm/pgalloc.h> /* pgd_*(), ... */
+#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
/*
* Page fault error code bits:
@@ -225,12 +203,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
if (!pmd_present(*pmd_k))
return NULL;
- if (!pmd_present(*pmd)) {
+ if (!pmd_present(*pmd))
set_pmd(pmd, *pmd_k);
- arch_flush_lazy_mmu_mode();
- } else {
+ else
BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
- }
return pmd_k;
}
@@ -450,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address)
}
static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+KERN_ERR
+"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+"******* Working around it, but it may cause SEGVs or burn power.\n"
+"******* Please consider a BIOS update.\n"
+"******* Disabling USB legacy in the BIOS may also help.\n";
/*
* No vm86 mode in 64-bit mode:
@@ -538,8 +515,6 @@ bad:
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
- static int once;
-
if (address != regs->ip)
return 0;
@@ -549,10 +524,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
address |= 0xffffffffUL << 32;
if ((address >= (u64)_stext && address <= (u64)_etext) ||
(address >= MODULES_VADDR && address <= MODULES_END)) {
- if (!once) {
- printk(errata93_warning);
- once = 1;
- }
+ printk_once(errata93_warning);
regs->ip = address;
return 1;
}
@@ -725,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
- printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
@@ -981,11 +953,17 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
tsk = current;
mm = tsk->mm;
- prefetchw(&mm->mmap_sem);
-
/* Get the faulting address: */
address = read_cr2();
+ /*
+ * Detect and handle instructions that would cause a page fault for
+ * both a tracked kernel page and a userspace page.
+ */
+ if (kmemcheck_active(regs))
+ kmemcheck_hide(regs);
+ prefetchw(&mm->mmap_sem);
+
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -1003,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
- vmalloc_fault(address) >= 0)
- return;
+ if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+
+ if (kmemcheck_fault(regs, address, error_code))
+ return;
+ }
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
@@ -1044,6 +1026,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
@@ -1130,17 +1114,22 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
- if (fault & VM_FAULT_MAJOR)
+ if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- else
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ regs, address);
+ } else {
tsk->min_flt++;
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ regs, address);
+ }
check_v8086_mode(regs, address, tsk);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 6340cef6798..71da1bca13c 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -14,7 +14,7 @@
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
- return *ptep;
+ return ACCESS_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ unsigned long flags;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ (void __user *)start, len)))
+ return 0;
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the the page and take a ref on it.
+ */
+ local_irq_save(flags);
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ break;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ break;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_restore(flags);
+
+ return nr;
+}
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -247,11 +303,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
+
end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
+ if (end < start)
goto slow_irqon;
+#ifdef CONFIG_X86_64
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ goto slow_irqon;
+#endif
+
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a..2112ed55e7e 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
- arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
#endif
}
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
@@ -105,6 +103,7 @@ EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_prot);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ae4f7b5d710..0607119cef9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
@@ -10,6 +11,10 @@
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/proto.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
@@ -23,6 +28,69 @@ int direct_gbpages
#endif
;
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ disable_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+static void __init set_nx(void)
+{
+ unsigned int v[4], l, h;
+
+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+ if ((v[3] & (1 << 20)) && !disable_nx) {
+ rdmsr(MSR_EFER, l, h);
+ l |= EFER_NX;
+ wrmsr(MSR_EFER, l, h);
+ nx_enabled = 1;
+ __supported_pte_mask |= _PAGE_NX;
+ }
+ }
+}
+#else
+static inline void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || disable_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
@@ -66,12 +134,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
*/
#ifdef CONFIG_X86_32
start = 0x7000;
- e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
- tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
+#else
start = 0x8000;
- e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
#endif
+ e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
if (e820_table_start == -1UL)
panic("Cannot find space for the kernel page tables");
@@ -111,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
return nr_range;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
-#endif
-
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
@@ -144,10 +197,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
@@ -159,12 +209,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
use_gbpages = direct_gbpages;
#endif
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
/* Enable PSE if available */
if (cpu_has_pse)
@@ -175,7 +222,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
-#endif
if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f..3cd7711bb94 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/page_types.h>
#include <asm/init.h>
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
@@ -114,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
pte_t *page_table = NULL;
if (after_bootmem) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
if (!page_table)
@@ -567,7 +564,7 @@ static inline void save_pg_dir(void)
}
#endif /* !CONFIG_ACPI_SLEEP */
-void zap_low_mappings(void)
+void zap_low_mappings(bool early)
{
int i;
@@ -584,64 +581,16 @@ void zap_low_mappings(void)
set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
}
- flush_tlb_all();
-}
-int nx_enabled;
+ if (early)
+ __flush_tlb();
+ else
+ flush_tlb_all();
+}
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on Enable
- * off Disable
- */
-static int __init noexec_setup(char *str)
-{
- if (!str || !strcmp(str, "on")) {
- if (cpu_has_nx) {
- __supported_pte_mask |= _PAGE_NX;
- disable_nx = 0;
- }
- } else {
- if (!strcmp(str, "off")) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- } else {
- return -EINVAL;
- }
- }
-
- return 0;
-}
-early_param("noexec", noexec_setup);
-
-void __init set_nx(void)
-{
- unsigned int v[4], l, h;
-
- if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
- cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
- if ((v[3] & (1 << 20)) && !disable_nx) {
- rdmsr(MSR_EFER, l, h);
- l |= EFER_NX;
- wrmsr(MSR_EFER, l, h);
- nx_enabled = 1;
- __supported_pte_mask |= _PAGE_NX;
- }
- }
-}
-#endif
-
/* user-defined highmem size */
static unsigned int highmem_pages = -1;
@@ -761,15 +710,15 @@ void __init initmem_init(unsigned long start_pfn,
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
- memory_present(0, 0, highend_pfn);
e820_register_active_regions(0, 0, highend_pfn);
+ sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- memory_present(0, 0, max_low_pfn);
e820_register_active_regions(0, 0, max_low_pfn);
+ sparse_memory_present_with_active_regions(0);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
@@ -1011,7 +960,7 @@ void __init mem_init(void)
test_wp_bit();
save_pg_dir();
- zap_low_mappings();
+ zap_low_mappings(true);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df..ea56b8cbb6a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
#include <asm/cacheflush.h>
#include <asm/init.h>
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
static unsigned long dma_reserve __initdata;
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec=on|off
- * Control non-executable mappings for 64-bit processes.
- *
- * on Enable (default)
- * off Disable
- */
-static int __init nonx_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- disable_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", nonx_setup);
-
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || disable_nx)
- __supported_pte_mask &= ~_PAGE_NX;
-}
-
int force_personality32;
/*
@@ -147,7 +104,7 @@ static __ref void *spp_getpage(void)
void *ptr;
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -324,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
void *adr;
if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
*phys = __pa(adr);
return adr;
@@ -570,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-unsigned long __init
+unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
+#endif
void __init paging_init(void)
{
@@ -638,11 +596,19 @@ void __init paging_init(void)
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = max_pfn;
- memory_present(0, 0, max_pfn);
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
+
+ /*
+ * clear the default setting with node 0
+ * note: don't use nodes_clear here, that is really clearing when
+ * numa support is not compiled in, and later node_set_state
+ * will not set it back.
+ */
+ node_clear_state(0, N_NORMAL_MEMORY);
+
free_area_init_nodes(max_zone_pfns);
}
-#endif
/*
* Memory hotplug specific functions
@@ -830,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
return ret;
#else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ reserve_bootmem(phys, len, flags);
#endif
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d3..fe6f84ca121 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
kpte_clear_flush(kmap_pte-idx, vaddr);
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 00000000000..520b3bce409
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
+obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 00000000000..4901d0dafda
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/kmemcheck.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include "error.h"
+#include "shadow.h"
+
+enum kmemcheck_error_type {
+ KMEMCHECK_ERROR_INVALID_ACCESS,
+ KMEMCHECK_ERROR_BUG,
+};
+
+#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
+
+struct kmemcheck_error {
+ enum kmemcheck_error_type type;
+
+ union {
+ /* KMEMCHECK_ERROR_INVALID_ACCESS */
+ struct {
+ /* Kind of access that caused the error */
+ enum kmemcheck_shadow state;
+ /* Address and size of the erroneous read */
+ unsigned long address;
+ unsigned int size;
+ };
+ };
+
+ struct pt_regs regs;
+ struct stack_trace trace;
+ unsigned long trace_entries[32];
+
+ /* We compress it to a char. */
+ unsigned char shadow_copy[SHADOW_COPY_SIZE];
+ unsigned char memory_copy[SHADOW_COPY_SIZE];
+};
+
+/*
+ * Create a ring queue of errors to output. We can't call printk() directly
+ * from the kmemcheck traps, since this may call the console drivers and
+ * result in a recursive fault.
+ */
+static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
+static unsigned int error_count;
+static unsigned int error_rd;
+static unsigned int error_wr;
+static unsigned int error_missed_count;
+
+static struct kmemcheck_error *error_next_wr(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == ARRAY_SIZE(error_fifo)) {
+ ++error_missed_count;
+ return NULL;
+ }
+
+ e = &error_fifo[error_wr];
+ if (++error_wr == ARRAY_SIZE(error_fifo))
+ error_wr = 0;
+ ++error_count;
+ return e;
+}
+
+static struct kmemcheck_error *error_next_rd(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == 0)
+ return NULL;
+
+ e = &error_fifo[error_rd];
+ if (++error_rd == ARRAY_SIZE(error_fifo))
+ error_rd = 0;
+ --error_count;
+ return e;
+}
+
+void kmemcheck_error_recall(void)
+{
+ static const char *desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
+ [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
+ [KMEMCHECK_SHADOW_FREED] = "freed",
+ };
+
+ static const char short_desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
+ [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
+ [KMEMCHECK_SHADOW_FREED] = 'f',
+ };
+
+ struct kmemcheck_error *e;
+ unsigned int i;
+
+ e = error_next_rd();
+ if (!e)
+ return;
+
+ switch (e->type) {
+ case KMEMCHECK_ERROR_INVALID_ACCESS:
+ printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
+ "from %s memory (%p)\n",
+ 8 * e->size, e->state < ARRAY_SIZE(desc) ?
+ desc[e->state] : "(invalid shadow state)",
+ (void *) e->address);
+
+ printk(KERN_INFO);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i)
+ printk("%02x", e->memory_copy[i]);
+ printk("\n");
+
+ printk(KERN_INFO);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
+ if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
+ printk(" %c", short_desc[e->shadow_copy[i]]);
+ else
+ printk(" ?");
+ }
+ printk("\n");
+ printk(KERN_INFO "%*c\n", 2 + 2
+ * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
+ break;
+ case KMEMCHECK_ERROR_BUG:
+ printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
+ break;
+ }
+
+ __show_regs(&e->regs, 1);
+ print_stack_trace(&e->trace, 0);
+}
+
+static void do_wakeup(unsigned long data)
+{
+ while (error_count > 0)
+ kmemcheck_error_recall();
+
+ if (error_missed_count > 0) {
+ printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
+ "the queue was too small\n", error_missed_count);
+ error_missed_count = 0;
+ }
+}
+
+static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
+
+/*
+ * Save the context of an error report.
+ */
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs)
+{
+ static unsigned long prev_ip;
+
+ struct kmemcheck_error *e;
+ void *shadow_copy;
+ void *memory_copy;
+
+ /* Don't report several adjacent errors from the same EIP. */
+ if (regs->ip == prev_ip)
+ return;
+ prev_ip = regs->ip;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
+
+ e->state = state;
+ e->address = address;
+ e->size = size;
+
+ /* Save regs */
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ /* Save stack trace */
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 0;
+ save_stack_trace_bp(&e->trace, regs->bp);
+
+ /* Round address down to nearest 16 bytes */
+ shadow_copy = kmemcheck_shadow_lookup(address
+ & ~(SHADOW_COPY_SIZE - 1));
+ BUG_ON(!shadow_copy);
+
+ memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
+
+ kmemcheck_show_addr(address);
+ memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
+ memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
+ kmemcheck_hide_addr(address);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
+
+/*
+ * Save the context of a kmemcheck bug.
+ */
+void kmemcheck_error_save_bug(struct pt_regs *regs)
+{
+ struct kmemcheck_error *e;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_BUG;
+
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 1;
+ save_stack_trace(&e->trace);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 00000000000..0efc2e8d0a2
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
+#define ARCH__X86__MM__KMEMCHECK__ERROR_H
+
+#include <linux/ptrace.h>
+
+#include "shadow.h"
+
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs);
+
+void kmemcheck_error_save_bug(struct pt_regs *regs);
+
+void kmemcheck_error_recall(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 00000000000..2c55ed09865
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
+/**
+ * kmemcheck - a heavyweight memory checker for the linux kernel
+ * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
+ * (With a lot of help from Ingo Molnar and Pekka Enberg.)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2) as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kmemcheck.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+#include "error.h"
+#include "opcode.h"
+#include "pte.h"
+#include "selftest.h"
+#include "shadow.h"
+
+
+#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 0
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 1
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
+# define KMEMCHECK_ENABLED 2
+#endif
+
+int kmemcheck_enabled = KMEMCHECK_ENABLED;
+
+int __init kmemcheck_init(void)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Limit SMP to use a single CPU. We rely on the fact that this code
+ * runs before SMP is set up.
+ */
+ if (setup_max_cpus > 1) {
+ printk(KERN_INFO
+ "kmemcheck: Limiting number of CPUs to 1.\n");
+ setup_max_cpus = 1;
+ }
+#endif
+
+ if (!kmemcheck_selftest()) {
+ printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
+ kmemcheck_enabled = 0;
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "kmemcheck: Initialized\n");
+ return 0;
+}
+
+early_initcall(kmemcheck_init);
+
+/*
+ * We need to parse the kmemcheck= option before any memory is allocated.
+ */
+static int __init param_kmemcheck(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ sscanf(str, "%d", &kmemcheck_enabled);
+ return 0;
+}
+
+early_param("kmemcheck", param_kmemcheck);
+
+int kmemcheck_show_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+int kmemcheck_hide_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+struct kmemcheck_context {
+ bool busy;
+ int balance;
+
+ /*
+ * There can be at most two memory operands to an instruction, but
+ * each address can cross a page boundary -- so we may need up to
+ * four addresses that must be hidden/revealed for each fault.
+ */
+ unsigned long addr[4];
+ unsigned long n_addrs;
+ unsigned long flags;
+
+ /* Data size of the instruction that caused a fault. */
+ unsigned int size;
+};
+
+static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
+
+bool kmemcheck_active(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ return data->balance > 0;
+}
+
+/* Save an address that needs to be shown/hidden */
+static void kmemcheck_save_addr(unsigned long addr)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
+ data->addr[data->n_addrs++] = addr;
+}
+
+static unsigned int kmemcheck_show_all(void)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_show_addr(data->addr[i]);
+
+ return n;
+}
+
+static unsigned int kmemcheck_hide_all(void)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_hide_addr(data->addr[i]);
+
+ return n;
+}
+
+/*
+ * Called from the #PF handler.
+ */
+void kmemcheck_show(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ BUG_ON(!irqs_disabled());
+
+ if (unlikely(data->balance != 0)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->balance = 0;
+ return;
+ }
+
+ /*
+ * None of the addresses actually belonged to kmemcheck. Note that
+ * this is not an error.
+ */
+ if (kmemcheck_show_all() == 0)
+ return;
+
+ ++data->balance;
+
+ /*
+ * The IF needs to be cleared as well, so that the faulting
+ * instruction can run "uninterrupted". Otherwise, we might take
+ * an interrupt and start executing that before we've had a chance
+ * to hide the page again.
+ *
+ * NOTE: In the rare case of multiple faults, we must not override
+ * the original flags:
+ */
+ if (!(regs->flags & X86_EFLAGS_TF))
+ data->flags = regs->flags;
+
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * Called from the #DB handler.
+ */
+void kmemcheck_hide(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ int n;
+
+ BUG_ON(!irqs_disabled());
+
+ if (data->balance == 0)
+ return;
+
+ if (unlikely(data->balance != 1)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->n_addrs = 0;
+ data->balance = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+ return;
+ }
+
+ if (kmemcheck_enabled)
+ n = kmemcheck_hide_all();
+ else
+ n = kmemcheck_show_all();
+
+ if (n == 0)
+ return;
+
+ --data->balance;
+
+ data->n_addrs = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+}
+
+void kmemcheck_show_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+bool kmemcheck_page_is_tracked(struct page *p)
+{
+ /* This will also check the "hidden" flag of the PTE. */
+ return kmemcheck_pte_lookup((unsigned long) page_address(p));
+}
+
+void kmemcheck_hide_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+/* Access may NOT cross page boundary */
+static void kmemcheck_read_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+ enum kmemcheck_shadow status;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+
+ /* Don't warn about it again. */
+ kmemcheck_shadow_set(shadow, size);
+}
+
+/* Access may cross page boundary */
+static void kmemcheck_read(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_read_strict(regs, addr, size);
+ return;
+ }
+
+ /*
+ * What we do is basically to split the access across the
+ * two pages and handle each part separately. Yes, this means
+ * that we may now see reads that are 3 + 5 bytes, for
+ * example (and if both are uninitialized, there will be two
+ * reports), but it makes the code a lot simpler.
+ */
+ kmemcheck_read_strict(regs, addr, next_page - addr);
+ kmemcheck_read_strict(regs, next_page, next_addr - next_page);
+}
+
+static void kmemcheck_write_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ kmemcheck_shadow_set(shadow, size);
+}
+
+static void kmemcheck_write(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_write_strict(regs, addr, size);
+ return;
+ }
+
+ /* See comment in kmemcheck_read(). */
+ kmemcheck_write_strict(regs, addr, next_page - addr);
+ kmemcheck_write_strict(regs, next_page, next_addr - next_page);
+}
+
+/*
+ * Copying is hard. We have two addresses, each of which may be split across
+ * a page (and each page will have different shadow addresses).
+ */
+static void kmemcheck_copy(struct pt_regs *regs,
+ unsigned long src_addr, unsigned long dst_addr, unsigned int size)
+{
+ uint8_t shadow[8];
+ enum kmemcheck_shadow status;
+
+ unsigned long page;
+ unsigned long next_addr;
+ unsigned long next_page;
+
+ uint8_t *x;
+ unsigned int i;
+ unsigned int n;
+
+ BUG_ON(size > sizeof(shadow));
+
+ page = src_addr & PAGE_MASK;
+ next_addr = src_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < size; ++i)
+ shadow[i] = x[i];
+ } else {
+ for (i = 0; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ } else {
+ n = next_page - src_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < n; ++i)
+ shadow[i] = x[i];
+ } else {
+ /* Not tracked */
+ for (i = 0; i < n; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i)
+ shadow[i] = x[i - n];
+ } else {
+ /* Not tracked */
+ for (i = n; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ page = dst_addr & PAGE_MASK;
+ next_addr = dst_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < size; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ } else {
+ n = next_page - dst_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < n; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i) {
+ x[i - n] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ }
+
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, src_addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+}
+
+enum kmemcheck_method {
+ KMEMCHECK_READ,
+ KMEMCHECK_WRITE,
+};
+
+static void kmemcheck_access(struct pt_regs *regs,
+ unsigned long fallback_address, enum kmemcheck_method fallback_method)
+{
+ const uint8_t *insn;
+ const uint8_t *insn_primary;
+ unsigned int size;
+
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ /* Recursive fault -- ouch. */
+ if (data->busy) {
+ kmemcheck_show_addr(fallback_address);
+ kmemcheck_error_save_bug(regs);
+ return;
+ }
+
+ data->busy = true;
+
+ insn = (const uint8_t *) regs->ip;
+ insn_primary = kmemcheck_opcode_get_primary(insn);
+
+ kmemcheck_opcode_decode(insn, &size);
+
+ switch (insn_primary[0]) {
+#ifdef CONFIG_KMEMCHECK_BITOPS_OK
+ /* AND, OR, XOR */
+ /*
+ * Unfortunately, these instructions have to be excluded from
+ * our regular checking since they access only some (and not
+ * all) bits. This clears out "bogus" bitfield-access warnings.
+ */
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ switch ((insn_primary[1] >> 3) & 7) {
+ /* OR */
+ case 1:
+ /* AND */
+ case 4:
+ /* XOR */
+ case 6:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+
+ /* ADD */
+ case 0:
+ /* ADC */
+ case 2:
+ /* SBB */
+ case 3:
+ /* SUB */
+ case 5:
+ /* CMP */
+ case 7:
+ break;
+ }
+ break;
+#endif
+
+ /* MOVS, MOVSB, MOVSW, MOVSD */
+ case 0xa4:
+ case 0xa5:
+ /*
+ * These instructions are special because they take two
+ * addresses, but we only get one page fault.
+ */
+ kmemcheck_copy(regs, regs->si, regs->di, size);
+ goto out;
+
+ /* CMPS, CMPSB, CMPSW, CMPSD */
+ case 0xa6:
+ case 0xa7:
+ kmemcheck_read(regs, regs->si, size);
+ kmemcheck_read(regs, regs->di, size);
+ goto out;
+ }
+
+ /*
+ * If the opcode isn't special in any way, we use the data from the
+ * page fault handler to determine the address and type of memory
+ * access.
+ */
+ switch (fallback_method) {
+ case KMEMCHECK_READ:
+ kmemcheck_read(regs, fallback_address, size);
+ goto out;
+ case KMEMCHECK_WRITE:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+ }
+
+out:
+ data->busy = false;
+}
+
+bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ pte_t *pte;
+
+ /*
+ * XXX: Is it safe to assume that memory accesses from virtual 86
+ * mode or non-kernel code segments will _never_ access kernel
+ * memory (e.g. tracked pages)? For now, we need this to avoid
+ * invoking kmemcheck for PnP BIOS calls.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return false;
+ if (regs->cs != __KERNEL_CS)
+ return false;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return false;
+
+ if (error_code & 2)
+ kmemcheck_access(regs, address, KMEMCHECK_WRITE);
+ else
+ kmemcheck_access(regs, address, KMEMCHECK_READ);
+
+ kmemcheck_show(regs);
+ return true;
+}
+
+bool kmemcheck_trap(struct pt_regs *regs)
+{
+ if (!kmemcheck_active(regs))
+ return false;
+
+ /* We're done. */
+ kmemcheck_hide(regs);
+ return true;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 00000000000..63c19e27aa6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
+#include <linux/types.h>
+
+#include "opcode.h"
+
+static bool opcode_is_prefix(uint8_t b)
+{
+ return
+ /* Group 1 */
+ b == 0xf0 || b == 0xf2 || b == 0xf3
+ /* Group 2 */
+ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
+ || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+ /* Group 3 */
+ || b == 0x66
+ /* Group 4 */
+ || b == 0x67;
+}
+
+#ifdef CONFIG_X86_64
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return (b & 0xf0) == 0x40;
+}
+#else
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return false;
+}
+#endif
+
+#define REX_W (1 << 3)
+
+/*
+ * This is a VERY crude opcode decoder. We only need to find the size of the
+ * load/store that caused our #PF and this should work for all the opcodes
+ * that we care about. Moreover, the ones who invented this instruction set
+ * should be shot.
+ */
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
+{
+ /* Default operand size */
+ int operand_size_override = 4;
+
+ /* prefixes */
+ for (; opcode_is_prefix(*op); ++op) {
+ if (*op == 0x66)
+ operand_size_override = 2;
+ }
+
+ /* REX prefix */
+ if (opcode_is_rex_prefix(*op)) {
+ uint8_t rex = *op;
+
+ ++op;
+ if (rex & REX_W) {
+ switch (*op) {
+ case 0x63:
+ *size = 4;
+ return;
+ case 0x0f:
+ ++op;
+
+ switch (*op) {
+ case 0xb6:
+ case 0xbe:
+ *size = 1;
+ return;
+ case 0xb7:
+ case 0xbf:
+ *size = 2;
+ return;
+ }
+
+ break;
+ }
+
+ *size = 8;
+ return;
+ }
+ }
+
+ /* escape opcode */
+ if (*op == 0x0f) {
+ ++op;
+
+ /*
+ * This is move with zero-extend and sign-extend, respectively;
+ * we don't have to think about 0xb6/0xbe, because this is
+ * already handled in the conditional below.
+ */
+ if (*op == 0xb7 || *op == 0xbf)
+ operand_size_override = 2;
+ }
+
+ *size = (*op & 1) ? operand_size_override : 1;
+}
+
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
+{
+ /* skip prefixes */
+ while (opcode_is_prefix(*op))
+ ++op;
+ if (opcode_is_rex_prefix(*op))
+ ++op;
+ return op;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 00000000000..6956aad66b5
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
+#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
+
+#include <linux/types.h>
+
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 00000000000..4ead26eeaf9
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+#include "pte.h"
+
+pte_t *kmemcheck_pte_lookup(unsigned long address)
+{
+ pte_t *pte;
+ unsigned int level;
+
+ pte = lookup_address(address, &level);
+ if (!pte)
+ return NULL;
+ if (level != PG_LEVEL_4K)
+ return NULL;
+ if (!pte_hidden(*pte))
+ return NULL;
+
+ return pte;
+}
+
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 00000000000..9f596645649
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
+#define ARCH__X86__MM__KMEMCHECK__PTE_H
+
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+pte_t *kmemcheck_pte_lookup(unsigned long address);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 00000000000..036efbea8b2
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
+#include <linux/kernel.h>
+
+#include "opcode.h"
+#include "selftest.h"
+
+struct selftest_opcode {
+ unsigned int expected_size;
+ const uint8_t *insn;
+ const char *desc;
+};
+
+static const struct selftest_opcode selftest_opcodes[] = {
+ /* REP MOVS */
+ {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
+ {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
+
+ /* MOVZX / MOVZXD */
+ {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
+ {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
+ {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
+
+#ifdef CONFIG_X86_64
+ /* MOVZX / MOVZXD */
+ {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
+ {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
+#endif
+};
+
+static bool selftest_opcode_one(const struct selftest_opcode *op)
+{
+ unsigned size;
+
+ kmemcheck_opcode_decode(op->insn, &size);
+
+ if (size == op->expected_size)
+ return true;
+
+ printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
+ op->desc, op->expected_size, size);
+ return false;
+}
+
+static bool selftest_opcodes_all(void)
+{
+ bool pass = true;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
+ pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
+
+ return pass;
+}
+
+bool kmemcheck_selftest(void)
+{
+ bool pass = true;
+
+ pass = pass && selftest_opcodes_all();
+
+ return pass;
+}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 00000000000..8fed4fe11f9
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
+#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+
+bool kmemcheck_selftest(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 00000000000..e773b6bd007
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include "pte.h"
+#include "shadow.h"
+
+/*
+ * Return the shadow address for the given address. Returns NULL if the
+ * address is not tracked.
+ *
+ * We need to be extremely careful not to follow any invalid pointers,
+ * because this function can be called for *any* possible address.
+ */
+void *kmemcheck_shadow_lookup(unsigned long address)
+{
+ pte_t *pte;
+ struct page *page;
+
+ if (!virt_addr_valid(address))
+ return NULL;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return NULL;
+
+ page = virt_to_page(address);
+ if (!page->shadow)
+ return NULL;
+ return page->shadow + (address & (PAGE_SIZE - 1));
+}
+
+static void mark_shadow(void *address, unsigned int n,
+ enum kmemcheck_shadow status)
+{
+ unsigned long addr = (unsigned long) address;
+ unsigned long last_addr = addr + n - 1;
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long last_page = last_addr & PAGE_MASK;
+ unsigned int first_n;
+ void *shadow;
+
+ /* If the memory range crosses a page boundary, stop there. */
+ if (page == last_page)
+ first_n = n;
+ else
+ first_n = page + PAGE_SIZE - addr;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, first_n);
+
+ addr += first_n;
+ n -= first_n;
+
+ /* Do full-page memset()s. */
+ while (n >= PAGE_SIZE) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, PAGE_SIZE);
+
+ addr += PAGE_SIZE;
+ n -= PAGE_SIZE;
+ }
+
+ /* Do the remaining page, if any. */
+ if (n > 0) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, n);
+ }
+}
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
+}
+
+void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
+}
+
+/*
+ * Fill the shadow memory of the given address such that the memory at that
+ * address is marked as being initialized.
+ */
+void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
+}
+EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
+
+void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
+}
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
+ /*
+ * Make sure _some_ bytes are initialized. Gcc frequently generates
+ * code to access neighboring bytes.
+ */
+ for (i = 0; i < size; ++i) {
+ if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+#else
+ /* All bytes must be initialized. */
+ for (i = 0; i < size; ++i) {
+ if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+#endif
+
+ return x[0];
+}
+
+void kmemcheck_shadow_set(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+ for (i = 0; i < size; ++i)
+ x[i] = KMEMCHECK_SHADOW_INITIALIZED;
+}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 00000000000..af46d9ab9d8
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
+#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
+
+enum kmemcheck_shadow {
+ KMEMCHECK_SHADOW_UNALLOCATED,
+ KMEMCHECK_SHADOW_UNINITIALIZED,
+ KMEMCHECK_SHADOW_INITIALIZED,
+ KMEMCHECK_SHADOW_FREED,
+};
+
+void *kmemcheck_shadow_lookup(unsigned long address);
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+void kmemcheck_shadow_set(void *shadow, unsigned int size);
+
+#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 50dc802a1c4..16ccbd77917 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
- bool old_presence; /* page presence prior to arming */
+ pteval_t old_presence; /* page presence prior to arming */
bool armed;
/*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
struct list_head *head;
- struct kmmio_fault_page *p;
+ struct kmmio_fault_page *f;
page &= PAGE_MASK;
head = kmmio_page_list(page);
- list_for_each_entry_rcu(p, head, list) {
- if (p->page == page)
- return p;
+ list_for_each_entry_rcu(f, head, list) {
+ if (f->page == page)
+ return f;
}
return NULL;
}
-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
pmdval_t v = pmd_val(*pmd);
- *old = !!(v & _PAGE_PRESENT);
- v &= ~_PAGE_PRESENT;
- if (present)
- v |= _PAGE_PRESENT;
+ if (clear) {
+ *old = v & _PAGE_PRESENT;
+ v &= ~_PAGE_PRESENT;
+ } else /* presume this has been called with clear==true previously */
+ v |= *old;
set_pmd(pmd, __pmd(v));
}
-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
- *old = !!(v & _PAGE_PRESENT);
- v &= ~_PAGE_PRESENT;
- if (present)
- v |= _PAGE_PRESENT;
+ if (clear) {
+ *old = v & _PAGE_PRESENT;
+ v &= ~_PAGE_PRESENT;
+ } else /* presume this has been called with clear==true previously */
+ v |= *old;
set_pte_atomic(pte, __pte(v));
}
-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
- pte_t *pte = lookup_address(addr, &level);
+ pte_t *pte = lookup_address(f->page, &level);
if (!pte) {
- pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+ pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
return -1;
}
switch (level) {
case PG_LEVEL_2M:
- set_pmd_presence((pmd_t *)pte, present, old);
+ clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
break;
case PG_LEVEL_4K:
- set_pte_presence(pte, present, old);
+ clear_pte_presence(pte, clear, &f->old_presence);
break;
default:
pr_err("kmmio: unexpected page level 0x%x.\n", level);
return -1;
}
- __flush_tlb_one(addr);
+ __flush_tlb_one(f->page);
return 0;
}
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
if (f->armed) {
pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
- f->page, f->count, f->old_presence);
+ f->page, f->count, !!f->old_presence);
}
- ret = set_page_presence(f->page, false, &f->old_presence);
+ ret = clear_page_presence(f, true);
WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
f->armed = true;
return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
- bool tmp;
- int ret = set_page_presence(f->page, f->old_presence, &tmp);
+ int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
if (!ctx->active) {
- pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+ /*
+ * debug traps without an active context are due to either
+ * something external causing them (f.e. using a debugger while
+ * mmio tracing enabled), or erroneous behaviour
+ */
+ pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
smp_processor_id());
goto out;
}
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
head,
struct kmmio_delayed_release,
rcu);
- struct kmmio_fault_page *p = dr->release_list;
- while (p) {
- struct kmmio_fault_page *next = p->release_next;
- BUG_ON(p->count);
- kfree(p);
- p = next;
+ struct kmmio_fault_page *f = dr->release_list;
+ while (f) {
+ struct kmmio_fault_page *next = f->release_next;
+ BUG_ON(f->count);
+ kfree(f);
+ f = next;
}
kfree(dr);
}
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
{
struct kmmio_delayed_release *dr =
container_of(head, struct kmmio_delayed_release, rcu);
- struct kmmio_fault_page *p = dr->release_list;
+ struct kmmio_fault_page *f = dr->release_list;
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
spin_lock_irqsave(&kmmio_lock, flags);
- while (p) {
- if (!p->count) {
- list_del_rcu(&p->list);
- prevp = &p->release_next;
+ while (f) {
+ if (!f->count) {
+ list_del_rcu(&f->list);
+ prevp = &f->release_next;
} else {
- *prevp = p->release_next;
+ *prevp = f->release_next;
}
- p = p->release_next;
+ f = f->release_next;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
}
EXPORT_SYMBOL(unregister_kmmio_probe);
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
- void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
struct die_args *arg = args;
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
.notifier_call = kmmio_die_notifier
};
-static int __init init_kmmio(void)
+int kmmio_init(void)
{
int i;
+
for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
INIT_LIST_HEAD(&kmmio_page_table[i]);
+
return register_die_notifier(&nb_die);
}
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+ int i;
+
+ unregister_die_notifier(&nb_die);
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+ WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+ KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+ }
+}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be0621..18d244f7020 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,22 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
{
- u64 i, count;
- u64 *start;
+ u64 *p, *start, *end;
u64 start_bad, last_bad;
u64 start_phys_aligned;
- size_t incr;
+ const size_t incr = sizeof(pattern);
- incr = sizeof(pattern);
start_phys_aligned = ALIGN(start_phys, incr);
- count = (size - (start_phys_aligned - start_phys))/incr;
start = __va(start_phys_aligned);
+ end = start + (size - (start_phys_aligned - start_phys)) / incr;
start_bad = 0;
last_bad = 0;
- for (i = 0; i < count; i++)
- start[i] = pattern;
- for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
- if (*start == pattern)
+ for (p = start; p < end; p++)
+ *p = pattern;
+
+ for (p = start; p < end; p++, start_phys_aligned += incr) {
+ if (*p == pattern)
continue;
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c9342ed8b40..132772a8ec5 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
+ kmmio_init();
enter_uniprocessor();
spin_lock_irq(&trace_lock);
atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
+ kmmio_cleanup();
pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029d..459913beac7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
}
/* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start,
- unsigned long end)
+void __init
+setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+ const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
- const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
if (!end)
return;
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ return;
+
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
-#ifdef CONFIG_ACPI_NUMA
- srat_reserve_add_area(nodeid);
-#endif
node_set_online(nodeid);
}
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-void __init paging_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
-
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
- sparse_init();
-
- free_area_init_nodes(max_zone_pfns);
-}
-
static __init int numa_setup(char *opt)
{
if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt, "noacpi", 6))
acpi_numa = -1;
- if (!strncmp(opt, "hotadd=", 7))
- hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
return 0;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed088c..7e600c1962d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -11,6 +11,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/pfn.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -470,7 +471,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
if (!debug_pagealloc)
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL, 0);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
if (!debug_pagealloc)
spin_lock(&cpa_lock);
if (!base)
@@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_PAGES_ARRAY)
- address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
- else if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ address = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
@@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
- int ret = 0;
- unsigned long temp_cpa_vaddr, vaddr;
+ unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+ unsigned long vaddr, remapped;
+ int ret;
if (cpa->pfn >= max_pfn_mapped)
return 0;
@@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_PAGES_ARRAY)
- vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
- else if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ vaddr = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
@@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa)
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
alias_cpa = *cpa;
- temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
- alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.vaddr = &laddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
-
ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ if (ret)
+ return ret;
}
#ifdef CONFIG_X86_64
- if (ret)
- return ret;
/*
- * No need to redo, when the primary call touched the high
- * mapping already:
- */
- if (within(vaddr, (unsigned long) _text, _brk_end))
- return 0;
-
- /*
- * If the physical address is inside the kernel map, we need
+ * If the primary call didn't touch the high mapping already
+ * and the physical address is inside the kernel map, we need
* to touch the high mapped kernel as well:
*/
- if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
- return 0;
+ if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+ within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+ unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+ __START_KERNEL_map - phys_base;
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
- alias_cpa = *cpa;
- temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
- alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+ /*
+ * The high mapping range is imprecise, so ignore the
+ * return value.
+ */
+ __change_page_attr_set_clr(&alias_cpa, 0);
+ }
+#endif
/*
- * The high mapping range is imprecise, so ignore the return value.
+ * If the PMD page was partially used for per-cpu remapping,
+ * the recycled area needs to be split and modified. Because
+ * the area is always proper subset of a PMD page
+ * cpa->numpages is guaranteed to be 1 for these areas, so
+ * there's no need to loop over and check for further remaps.
*/
- __change_page_attr_set_clr(&alias_cpa, 0);
-#endif
- return ret;
+ remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
+ if (remapped) {
+ WARN_ON(cpa->numpages > 1);
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &remapped;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
@@ -839,13 +860,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
vm_unmap_aliases();
- /*
- * If we're called with lazy mmu updates enabled, the
- * in-memory pte state may be stale. Flush pending updates to
- * bring them up to date.
- */
- arch_flush_lazy_mmu_mode();
-
cpa.vaddr = addr;
cpa.pages = pages;
cpa.numpages = numpages;
@@ -890,13 +904,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
} else
cpa_flush_all(cache);
- /*
- * If we've been called with lazy mmu updates enabled, then
- * make sure that everything gets flushed out before we
- * return.
- */
- arch_flush_lazy_mmu_mode();
-
out:
return ret;
}
@@ -996,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
int _set_memory_wc(unsigned long addr, int numpages)
{
int ret;
+ unsigned long addr_copy = addr;
+
ret = change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_CACHE_UC_MINUS), 0);
-
if (!ret) {
- ret = change_page_attr_set(&addr, numpages,
- __pgprot(_PAGE_CACHE_WC), 0);
+ ret = change_page_attr_set_clr(&addr_copy, numpages,
+ __pgprot(_PAGE_CACHE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, 0, NULL);
}
return ret;
}
@@ -1118,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
int free_idx;
for (i = 0; i < addrinarray; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
goto err_out;
@@ -1131,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
err_out:
free_idx = i;
for (i = 0; i < free_idx; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
free_memtype(start, end);
}
@@ -1160,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
return retval;
for (i = 0; i < addrinarray; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
free_memtype(start, end);
}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e6718bb2806..352aa9e927e 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -623,7 +623,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
return ret;
if (flags != want_flags) {
- if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) {
+ if (strict_prot ||
+ !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
free_memtype(paddr, paddr + size);
printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
" for %Lx-%Lx, got %s\n",
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f..ed34f5e3599 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,16 +16,16 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+ pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ pte = alloc_pages(PGALLOC_GFP, 0);
#endif
if (pte)
pgtable_page_ctor(pte);
return pte;
}
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
@@ -31,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
}
#if PAGETABLE_LEVELS > 2
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
#if PAGETABLE_LEVELS > 3
-void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
bool failed = false;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
if (pmd == NULL)
failed = true;
pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pmd_t *pmds[PREALLOCATED_PMDS];
unsigned long flags;
- pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
if (pgd == NULL)
goto out;
@@ -327,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
- __VMALLOC_RESERVE += reserve;
#endif
}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baa..dbb5381f7b3 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
static nodemask_t cpu_nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
-/* Too small nodes confuse the VM badly. Usually they result
- from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
static __init int setup_node(int pxm)
{
return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
struct bootnode *nd = &nodes[i];
- if (found_add_area)
- return;
-
if (nd->start < start) {
nd->start = start;
if (nd->end < nd->start)
@@ -86,11 +77,12 @@ static __init void bad_srat(void)
int i;
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
- found_add_area = 0;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
- for (i = 0; i < MAX_NUMNODES; i++)
- nodes_add[i].start = nodes[i].end = 0;
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ nodes[i].start = nodes[i].end = 0;
+ nodes_add[i].start = nodes_add[i].end = 0;
+ }
remove_all_active_ranges();
}
@@ -182,24 +174,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
pxm, apic_id, node);
}
-static int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add information.
- * This code supports one contiguous hot add area per node.
+ * Update nodes_add[]
+ * This code supports one contiguous hot add area per node
*/
-static int __init
-reserve_hotadd(int node, unsigned long start, unsigned long end)
+static void __init
+update_nodes_add(int node, unsigned long start, unsigned long end)
{
unsigned long s_pfn = start >> PAGE_SHIFT;
unsigned long e_pfn = end >> PAGE_SHIFT;
- int ret = 0, changed = 0;
+ int changed = 0;
struct bootnode *nd = &nodes_add[node];
/* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +199,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
mistakes */
if ((signed long)(end - start) < NODE_MIN_SIZE) {
printk(KERN_ERR "SRAT: Hotplug area too small\n");
- return -1;
+ return;
}
/* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +207,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
printk(KERN_ERR
"SRAT: Hotplug area %lu -> %lu has existing memory\n",
s_pfn, e_pfn);
- return -1;
- }
-
- if (!hotadd_enough_memory(&nodes_add[node])) {
- printk(KERN_ERR "SRAT: Hotplug area too large\n");
- return -1;
+ return;
}
/* Looks good */
@@ -245,11 +229,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
}
- ret = update_end_of_memory(nd->end);
-
if (changed)
- printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
- return ret;
+ printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+ nd->start, nd->end);
}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +292,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
start, end);
e820_register_active_regions(node, start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
- push_node_boundaries(node, nd->start >> PAGE_SHIFT,
- nd->end >> PAGE_SHIFT);
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
- (reserve_hotadd(node, start, end) < 0)) {
- /* Ignore hotadd region. Undo damage */
- printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+ if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+ update_nodes_add(node, start, end);
+ /* restore nodes[node] */
*nd = oldnode;
if ((nd->start | nd->end) == 0)
node_clear(node, nodes_parsed);
@@ -345,9 +324,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
pxmram = 0;
}
- e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
- /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
- if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+ e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
printk(KERN_ERR
"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
(pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +336,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
return 1;
}
-static void __init unparse_node(int node)
-{
- int i;
- node_clear(node, nodes_parsed);
- node_clear(node, cpu_nodes_parsed);
- for (i = 0; i < MAX_LOCAL_APIC; i++) {
- if (apicid_to_node[i] == node)
- apicid_to_node[i] = NUMA_NO_NODE;
- }
-}
-
void __init acpi_numa_arch_fixup(void) {}
/* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +347,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
/* First clean up the node list */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++)
cutoff_node(i, start, end);
- /*
- * don't confuse VM with a node that doesn't have the
- * minimum memory.
- */
- if (nodes[i].end &&
- (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
- unparse_node(i);
- node_set_offline(i);
- }
- }
if (!nodes_cover_memory(nodes)) {
bad_srat();
@@ -423,7 +381,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
if (node == NUMA_NO_NODE)
continue;
- if (!node_isset(node, node_possible_map))
+ if (!node_online(node))
numa_clear_node(i);
}
numa_init_array();
@@ -510,26 +468,6 @@ static int null_slit_node_compare(int a, int b)
}
#endif /* CONFIG_NUMA_EMU */
-void __init srat_reserve_add_area(int nodeid)
-{
- if (found_add_area && nodes_add[nodeid].end) {
- u64 total_mb;
-
- printk(KERN_INFO "SRAT: Reserving hot-add memory space "
- "for node %d at %Lx-%Lx\n",
- nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
- total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
- >> PAGE_SHIFT;
- total_mb *= sizeof(struct page);
- total_mb >>= 20;
- printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
- "pre-allocated memory.\n", (unsigned long long)total_mb);
- reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
- nodes_add[nodeid].end - nodes_add[nodeid].start,
- BOOTMEM_DEFAULT);
- }
-}
-
int __node_distance(int a, int b)
{
int index;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 821e97017e9..c814e144a3f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
f->flush_mm = mm;
f->flush_va = va;
- cpumask_andnot(to_cpumask(f->flush_cpumask),
- cpumask, cpumask_of(smp_processor_id()));
-
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
- INVALIDATE_TLB_VECTOR_START + sender);
+ if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
+ /*
+ * We have to send the IPI only to
+ * CPUs affected.
+ */
+ apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
+ INVALIDATE_TLB_VECTOR_START + sender);
- while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
- cpu_relax();
+ while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+ cpu_relax();
+ }
f->flush_mm = NULL;
f->flush_va = 0;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a..89b9a5cd63d 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
- ret = NOTIFY_STOP;
+ case DIE_NMI_IPI:
+ model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+ ret = NOTIFY_STOP;
break;
default:
break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = 2
};
static int nmi_setup(void)
@@ -356,14 +357,11 @@ static void exit_sysfs(void)
#define exit_sysfs() do { } while (0)
#endif /* CONFIG_PM */
-static int p4force;
-module_param(p4force, int, 0);
-
static int __init p4_init(char **cpu_type)
{
__u8 cpu_model = boot_cpu_data.x86_model;
- if (!p4force && (cpu_model > 6 || cpu_model == 5))
+ if (cpu_model > 6 || cpu_model == 5)
return 0;
#ifndef CONFIG_SMP
@@ -389,10 +387,25 @@ static int __init p4_init(char **cpu_type)
return 0;
}
+static int force_arch_perfmon;
+static int force_cpu_type(const char *str, struct kernel_param *kp)
+{
+ if (!strcmp(str, "arch_perfmon")) {
+ force_arch_perfmon = 1;
+ printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+ }
+
+ return 0;
+}
+module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
+
static int __init ppro_init(char **cpu_type)
{
__u8 cpu_model = boot_cpu_data.x86_model;
+ if (force_arch_perfmon && cpu_has_arch_perfmon)
+ return 0;
+
switch (cpu_model) {
case 0 ... 2:
*cpu_type = "i386/ppro";
@@ -414,6 +427,13 @@ static int __init ppro_init(char **cpu_type)
case 15: case 23:
*cpu_type = "i386/core_2";
break;
+ case 26:
+ arch_perfmon_setup_counters();
+ *cpu_type = "i386/core_i7";
+ break;
+ case 28:
+ *cpu_type = "i386/atom";
+ break;
default:
/* Unknown */
return 0;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaad..4da7230b3d1 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/nmi.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
#include "op_x86_model.h"
#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
u64 val;
int i;
+ /*
+ * This can happen if perf counters are in use when
+ * we steal the die notifier NMI.
+ */
+ if (unlikely(!reset_value))
+ goto out;
+
for (i = 0 ; i < num_counters; ++i) {
if (!reset_value[i])
continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
}
}
+out:
/* Only P6 based Pentium M need to re-unmask the apic vector but it
* doesn't hurt other P6 variant */
apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index c0ecf250fe5..1014eb4bfc3 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -38,15 +38,26 @@ count_resource(struct acpi_resource *acpi_res, void *data)
struct acpi_resource_address64 addr;
acpi_status status;
- if (info->res_num >= PCI_BUS_NUM_RESOURCES)
- return AE_OK;
-
status = resource_to_addr(acpi_res, &addr);
if (ACPI_SUCCESS(status))
info->res_num++;
return AE_OK;
}
+static int
+bus_has_transparent_bridge(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ u16 class = dev->class >> 8;
+
+ if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
+ return true;
+ }
+ return false;
+}
+
static acpi_status
setup_resource(struct acpi_resource *acpi_res, void *data)
{
@@ -56,9 +67,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
acpi_status status;
unsigned long flags;
struct resource *root;
+ int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
+ u64 start, end;
- if (info->res_num >= PCI_BUS_NUM_RESOURCES)
- return AE_OK;
+ if (bus_has_transparent_bridge(info->bus))
+ max_root_bus_resources -= 3;
status = resource_to_addr(acpi_res, &addr);
if (!ACPI_SUCCESS(status))
@@ -75,11 +88,22 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
} else
return AE_OK;
+ start = addr.minimum + addr.translation_offset;
+ end = start + addr.address_length - 1;
+ if (info->res_num >= max_root_bus_resources) {
+ printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
+ "from %s for %s due to _CRS returning more than "
+ "%d resource descriptors\n", (unsigned long) start,
+ (unsigned long) end, root->name, info->name,
+ max_root_bus_resources);
+ return AE_OK;
+ }
+
res = &info->res[info->res_num];
res->name = info->name;
res->flags = flags;
- res->start = addr.minimum + addr.translation_offset;
- res->end = res->start + addr.address_length - 1;
+ res->start = start;
+ res->end = end;
res->child = NULL;
if (insert_resource(root, res)) {
@@ -94,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
}
static void
-adjust_transparent_bridge_resources(struct pci_bus *bus)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- int i;
- u16 class = dev->class >> 8;
-
- if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) {
- for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++)
- dev->subordinate->resource[i] =
- dev->bus->resource[i - 3];
- }
- }
-}
-
-static void
get_current_resources(struct acpi_device *device, int busnum,
int domain, struct pci_bus *bus)
{
@@ -137,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum,
info.res_num = 0;
acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
&info);
- if (info.res_num)
- adjust_transparent_bridge_resources(bus);
return;
@@ -201,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
*/
memcpy(bus->sysdata, sd, sizeof(*sd));
kfree(sd);
- } else
- bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
+ } else {
+ bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
+ if (bus) {
+ if (pci_probe & PCI_USE__CRS)
+ get_current_resources(device, busnum, domain,
+ bus);
+ bus->subordinate = pci_scan_child_bus(bus);
+ }
+ }
if (!bus)
kfree(sd);
@@ -217,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
#endif
}
- if (bus && (pci_probe & PCI_USE__CRS))
- get_current_resources(device, busnum, domain, bus);
return bus;
}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index f893d6a6e80..3ffa10df20b 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
int j;
struct pci_root_info *info;
- /* don't go for it if _CRS is used */
- if (pci_probe & PCI_USE__CRS)
+ /* don't go for it if _CRS is used already */
+ if (b->resource[0] != &ioport_resource ||
+ b->resource[1] != &iomem_resource)
return;
/* if only one root bus, don't need to anything */
@@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
if (i == pci_root_num)
return;
+ printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
+ b->number);
+
info = &pci_root_info[i];
for (j = 0; j < info->res_num; j++) {
struct resource *res;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a85bef20a3b..52e62e57fed 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -35,6 +35,7 @@
#include <asm/pat.h>
#include <asm/e820.h>
#include <asm/pci_x86.h>
+#include <asm/io_apic.h>
static int
@@ -116,7 +117,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
struct pci_bus *bus;
struct pci_dev *dev;
int idx;
- struct resource *r, *pr;
+ struct resource *r;
/* Depth-First Search on bus tree */
list_for_each_entry(bus, bus_list, node) {
@@ -126,9 +127,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
r = &dev->resource[idx];
if (!r->flags)
continue;
- pr = pci_find_parent_resource(dev, r);
- if (!r->start || !pr ||
- request_resource(pr, r) < 0) {
+ if (!r->start ||
+ pci_claim_resource(dev, idx) < 0) {
dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
/*
* Something is wrong with the region.
@@ -149,7 +149,7 @@ static void __init pcibios_allocate_resources(int pass)
struct pci_dev *dev = NULL;
int idx, disabled;
u16 command;
- struct resource *r, *pr;
+ struct resource *r;
for_each_pci_dev(dev) {
pci_read_config_word(dev, PCI_COMMAND, &command);
@@ -168,8 +168,7 @@ static void __init pcibios_allocate_resources(int pass)
(unsigned long long) r->start,
(unsigned long long) r->end,
r->flags, disabled, pass);
- pr = pci_find_parent_resource(dev, r);
- if (!pr || request_resource(pr, r) < 0) {
+ if (pci_claim_resource(dev, idx) < 0) {
dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
/* We'll assign a new address later */
r->end -= r->start;
@@ -197,7 +196,7 @@ static void __init pcibios_allocate_resources(int pass)
static int __init pcibios_assign_resources(void)
{
struct pci_dev *dev = NULL;
- struct resource *r, *pr;
+ struct resource *r;
if (!(pci_probe & PCI_ASSIGN_ROMS)) {
/*
@@ -209,8 +208,7 @@ static int __init pcibios_assign_resources(void)
r = &dev->resource[PCI_ROM_RESOURCE];
if (!r->flags || !r->start)
continue;
- pr = pci_find_parent_resource(dev, r);
- if (!pr || request_resource(pr, r) < 0) {
+ if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
r->end -= r->start;
r->start = 0;
}
@@ -230,6 +228,12 @@ void __init pcibios_resource_survey(void)
pcibios_allocate_resources(1);
e820_reserve_resources_late();
+ /*
+ * Insert the IO APIC resources after PCI initialization has
+ * occured to handle IO APICS that are mapped in on a BAR in
+ * PCI space, but before trying to assign unassigned pci res.
+ */
+ ioapic_insert_resources();
}
/**
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6e7d7..0696d506c4a 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
return 0;
}
+ if (io_apic_assign_pci_irqs)
+ return 0;
+
/* Find IRQ routing entry */
if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
pirq_penalty[dev->irq]++;
}
+ if (io_apic_assign_pci_irqs)
+ return;
+
dev = NULL;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Recalculate IRQ numbers if we use the I/O APIC.
- */
- if (io_apic_assign_pci_irqs) {
- int irq;
-
- /*
- * interrupt pins are numbered starting from 1
- */
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin - 1);
- /*
- * Busses behind bridges are typically not listed in the
- * MP-table. In this case we have to look up the IRQ
- * based on the parent bus, parent slot, and pin number.
- * The SMP code detects such bridged busses itself so we
- * should get into this branch reliably.
- */
- if (irq < 0 && dev->bus->parent) {
- /* go back to the bridge */
- struct pci_dev *bridge = dev->bus->self;
- int bus;
-
- pin = pci_swizzle_interrupt_pin(dev, pin);
- bus = bridge->bus->number;
- irq = IO_APIC_get_PCI_irq_vector(bus,
- PCI_SLOT(bridge->devfn), pin - 1);
- if (irq >= 0)
- dev_warn(&dev->dev,
- "using bridge %s INT %c to "
- "get IRQ %d\n",
- pci_name(bridge),
- 'A' + pin - 1, irq);
- }
- if (irq >= 0) {
- dev_info(&dev->dev,
- "PCI->APIC IRQ transform: INT %c "
- "-> IRQ %d\n",
- 'A' + pin - 1, irq);
- dev->irq = irq;
- }
- }
-#endif
/*
* Still no IRQ? Try to lookup one...
*/
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
pcibios_enable_irq = pirq_enable_irq;
pcibios_fixup_irqs();
+
+ if (io_apic_assign_pci_irqs && pci_routeirq) {
+ struct pci_dev *dev = NULL;
+ /*
+ * PCI IRQ routing is set up by pci_enable_device(), but we
+ * also do it here in case there are still broken drivers that
+ * don't use pci_enable_device().
+ */
+ printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+ for_each_pci_dev(dev)
+ pirq_enable_irq(dev);
+ }
+
return 0;
}
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
static int pirq_enable_irq(struct pci_dev *dev)
{
u8 pin;
- struct pci_dev *temp_dev;
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
- if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+ if (pin && !pcibios_lookup_irq(dev, 1)) {
char *msg = "";
+ if (!io_apic_assign_pci_irqs && dev->irq)
+ return 0;
+
if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+ struct pci_dev *temp_dev;
int irq;
+ struct io_apic_irq_attr irq_attr;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ pin - 1, &irq_attr);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
pin = pci_swizzle_interrupt_pin(dev, pin);
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
- PCI_SLOT(bridge->devfn), pin - 1);
+ PCI_SLOT(bridge->devfn),
+ pin - 1, &irq_attr);
if (irq >= 0)
dev_warn(&dev->dev, "using bridge %s "
"INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
+ io_apic_set_pci_routing(&dev->dev, irq,
+ &irq_attr);
+ dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
- dev->irq = irq;
return 0;
} else
msg = "; probably buggy MP table";
+#endif
} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
msg = "";
else
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 8766b0e216c..712443ec6d4 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -523,6 +523,69 @@ reject:
static int __initdata known_bridge;
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
+/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
+struct acpi_mcfg_allocation *pci_mmcfg_config;
+int pci_mmcfg_config_num;
+
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+ if (!strcmp(mcfg->header.oem_id, "SGI"))
+ acpi_mcfg_64bit_base_addr = TRUE;
+
+ return 0;
+}
+
+static int __init pci_parse_mcfg(struct acpi_table_header *header)
+{
+ struct acpi_table_mcfg *mcfg;
+ unsigned long i;
+ int config_size;
+
+ if (!header)
+ return -EINVAL;
+
+ mcfg = (struct acpi_table_mcfg *)header;
+
+ /* how many config structures do we have */
+ pci_mmcfg_config_num = 0;
+ i = header->length - sizeof(struct acpi_table_mcfg);
+ while (i >= sizeof(struct acpi_mcfg_allocation)) {
+ ++pci_mmcfg_config_num;
+ i -= sizeof(struct acpi_mcfg_allocation);
+ };
+ if (pci_mmcfg_config_num == 0) {
+ printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
+ return -ENODEV;
+ }
+
+ config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
+ pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
+ if (!pci_mmcfg_config) {
+ printk(KERN_WARNING PREFIX
+ "No memory for MCFG config tables\n");
+ return -ENOMEM;
+ }
+
+ memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+ acpi_mcfg_oem_check(mcfg);
+
+ for (i = 0; i < pci_mmcfg_config_num; ++i) {
+ if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+ !acpi_mcfg_64bit_base_addr) {
+ printk(KERN_ERR PREFIX
+ "MMCONFIG not in low 4GB of memory\n");
+ kfree(pci_mmcfg_config);
+ pci_mmcfg_config_num = 0;
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
static void __init __pci_mmcfg_init(int early)
{
/* MMCONFIG disabled */
@@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early)
}
if (!known_bridge)
- acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
+ acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
pci_mmcfg_reject_broken(early);
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 58b32db3312..a6a198c3362 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,7 +1,7 @@
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_cpu_$(BITS).o := $(nostackp)
+CFLAGS_cpu.o := $(nostackp)
-obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o
+obj-$(CONFIG_PM_SLEEP) += cpu.o
obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu.c
index 5343540f260..b3d20b9cac6 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu.c
@@ -1,5 +1,5 @@
/*
- * Suspend and hibernation support for x86-64
+ * Suspend support specific for i386/x86-64.
*
* Distribute under GPLv2
*
@@ -8,18 +8,28 @@
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
-#include <linux/smp.h>
#include <linux/suspend.h>
-#include <asm/proto.h>
-#include <asm/page.h>
+#include <linux/smp.h>
+
#include <asm/pgtable.h>
+#include <asm/proto.h>
#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/mce.h>
#include <asm/xcr.h>
#include <asm/suspend.h>
-static void fix_processor_context(void);
+#ifdef CONFIG_X86_32
+static struct saved_context saved_context;
+unsigned long saved_context_ebx;
+unsigned long saved_context_esp, saved_context_ebp;
+unsigned long saved_context_esi, saved_context_edi;
+unsigned long saved_context_eflags;
+#else
+/* CONFIG_X86_64 */
struct saved_context saved_context;
+#endif
/**
* __save_processor_state - save CPU registers before creating a
@@ -38,19 +48,35 @@ struct saved_context saved_context;
*/
static void __save_processor_state(struct saved_context *ctxt)
{
+#ifdef CONFIG_X86_32
+ mtrr_save_fixed_ranges(NULL);
+#endif
kernel_fpu_begin();
/*
* descriptor tables
*/
+#ifdef CONFIG_X86_32
+ store_gdt(&ctxt->gdt);
+ store_idt(&ctxt->idt);
+#else
+/* CONFIG_X86_64 */
store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
store_idt((struct desc_ptr *)&ctxt->idt_limit);
+#endif
store_tr(ctxt->tr);
/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
/*
* segment registers
*/
+#ifdef CONFIG_X86_32
+ savesegment(es, ctxt->es);
+ savesegment(fs, ctxt->fs);
+ savesegment(gs, ctxt->gs);
+ savesegment(ss, ctxt->ss);
+#else
+/* CONFIG_X86_64 */
asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
@@ -62,30 +88,87 @@ static void __save_processor_state(struct saved_context *ctxt)
rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
mtrr_save_fixed_ranges(NULL);
+ rdmsrl(MSR_EFER, ctxt->efer);
+#endif
+
/*
* control registers
*/
- rdmsrl(MSR_EFER, ctxt->efer);
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
+#ifdef CONFIG_X86_32
+ ctxt->cr4 = read_cr4_safe();
+#else
+/* CONFIG_X86_64 */
ctxt->cr4 = read_cr4();
ctxt->cr8 = read_cr8();
+#endif
}
+/* Needed by apm.c */
void save_processor_state(void)
{
__save_processor_state(&saved_context);
}
+#ifdef CONFIG_X86_32
+EXPORT_SYMBOL(save_processor_state);
+#endif
static void do_fpu_end(void)
{
/*
- * Restore FPU regs if necessary
+ * Restore FPU regs if necessary.
*/
kernel_fpu_end();
}
+static void fix_processor_context(void)
+{
+ int cpu = smp_processor_id();
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+
+ set_tss_desc(cpu, t); /*
+ * This just modifies memory; should not be
+ * necessary. But... This is necessary, because
+ * 386 hardware has concept of busy TSS or some
+ * similar stupidity.
+ */
+
+#ifdef CONFIG_X86_64
+ get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
+
+ syscall_init(); /* This sets MSR_*STAR and related */
+#endif
+ load_TR_desc(); /* This does ltr */
+ load_LDT(&current->active_mm->context); /* This does lldt */
+
+ /*
+ * Now maybe reload the debug registers
+ */
+ if (current->thread.debugreg7) {
+#ifdef CONFIG_X86_32
+ set_debugreg(current->thread.debugreg0, 0);
+ set_debugreg(current->thread.debugreg1, 1);
+ set_debugreg(current->thread.debugreg2, 2);
+ set_debugreg(current->thread.debugreg3, 3);
+ /* no 4 and 5 */
+ set_debugreg(current->thread.debugreg6, 6);
+ set_debugreg(current->thread.debugreg7, 7);
+#else
+ /* CONFIG_X86_64 */
+ loaddebug(&current->thread, 0);
+ loaddebug(&current->thread, 1);
+ loaddebug(&current->thread, 2);
+ loaddebug(&current->thread, 3);
+ /* no 4 and 5 */
+ loaddebug(&current->thread, 6);
+ loaddebug(&current->thread, 7);
+#endif
+ }
+
+}
+
/**
* __restore_processor_state - restore the contents of CPU registers saved
* by __save_processor_state()
@@ -96,9 +179,16 @@ static void __restore_processor_state(struct saved_context *ctxt)
/*
* control registers
*/
+ /* cr4 was introduced in the Pentium CPU */
+#ifdef CONFIG_X86_32
+ if (ctxt->cr4)
+ write_cr4(ctxt->cr4);
+#else
+/* CONFIG X86_64 */
wrmsrl(MSR_EFER, ctxt->efer);
write_cr8(ctxt->cr8);
write_cr4(ctxt->cr4);
+#endif
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
write_cr0(ctxt->cr0);
@@ -107,13 +197,31 @@ static void __restore_processor_state(struct saved_context *ctxt)
* now restore the descriptor tables to their proper values
* ltr is done i fix_processor_context().
*/
+#ifdef CONFIG_X86_32
+ load_gdt(&ctxt->gdt);
+ load_idt(&ctxt->idt);
+#else
+/* CONFIG_X86_64 */
load_gdt((const struct desc_ptr *)&ctxt->gdt_limit);
load_idt((const struct desc_ptr *)&ctxt->idt_limit);
-
+#endif
/*
* segment registers
*/
+#ifdef CONFIG_X86_32
+ loadsegment(es, ctxt->es);
+ loadsegment(fs, ctxt->fs);
+ loadsegment(gs, ctxt->gs);
+ loadsegment(ss, ctxt->ss);
+
+ /*
+ * sysenter MSRs
+ */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ enable_sep_cpu();
+#else
+/* CONFIG_X86_64 */
asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
@@ -123,6 +231,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_FS_BASE, ctxt->fs_base);
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+#endif
/*
* restore XCR0 for xsave capable cpu's.
@@ -134,41 +243,17 @@ static void __restore_processor_state(struct saved_context *ctxt)
do_fpu_end();
mtrr_ap_init();
+
+#ifdef CONFIG_X86_OLD_MCE
+ mcheck_init(&boot_cpu_data);
+#endif
}
+/* Needed by apm.c */
void restore_processor_state(void)
{
__restore_processor_state(&saved_context);
}
-
-static void fix_processor_context(void)
-{
- int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
-
- /*
- * This just modifies memory; should not be necessary. But... This
- * is necessary, because 386 hardware has concept of busy TSS or some
- * similar stupidity.
- */
- set_tss_desc(cpu, t);
-
- get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
-
- syscall_init(); /* This sets MSR_*STAR and related */
- load_TR_desc(); /* This does ltr */
- load_LDT(&current->active_mm->context); /* This does lldt */
-
- /*
- * Now maybe reload the debug registers
- */
- if (current->thread.debugreg7){
- loaddebug(&current->thread, 0);
- loaddebug(&current->thread, 1);
- loaddebug(&current->thread, 2);
- loaddebug(&current->thread, 3);
- /* no 4 and 5 */
- loaddebug(&current->thread, 6);
- loaddebug(&current->thread, 7);
- }
-}
+#ifdef CONFIG_X86_32
+EXPORT_SYMBOL(restore_processor_state);
+#endif
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
deleted file mode 100644
index ce702c5b3a2..00000000000
--- a/arch/x86/power/cpu_32.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Suspend support specific for i386.
- *
- * Distribute under GPLv2
- *
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
- * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
- */
-
-#include <linux/module.h>
-#include <linux/suspend.h>
-#include <asm/mtrr.h>
-#include <asm/mce.h>
-#include <asm/xcr.h>
-#include <asm/suspend.h>
-
-static struct saved_context saved_context;
-
-unsigned long saved_context_ebx;
-unsigned long saved_context_esp, saved_context_ebp;
-unsigned long saved_context_esi, saved_context_edi;
-unsigned long saved_context_eflags;
-
-static void __save_processor_state(struct saved_context *ctxt)
-{
- mtrr_save_fixed_ranges(NULL);
- kernel_fpu_begin();
-
- /*
- * descriptor tables
- */
- store_gdt(&ctxt->gdt);
- store_idt(&ctxt->idt);
- store_tr(ctxt->tr);
-
- /*
- * segment registers
- */
- savesegment(es, ctxt->es);
- savesegment(fs, ctxt->fs);
- savesegment(gs, ctxt->gs);
- savesegment(ss, ctxt->ss);
-
- /*
- * control registers
- */
- ctxt->cr0 = read_cr0();
- ctxt->cr2 = read_cr2();
- ctxt->cr3 = read_cr3();
- ctxt->cr4 = read_cr4_safe();
-}
-
-/* Needed by apm.c */
-void save_processor_state(void)
-{
- __save_processor_state(&saved_context);
-}
-EXPORT_SYMBOL(save_processor_state);
-
-static void do_fpu_end(void)
-{
- /*
- * Restore FPU regs if necessary.
- */
- kernel_fpu_end();
-}
-
-static void fix_processor_context(void)
-{
- int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
-
- set_tss_desc(cpu, t); /*
- * This just modifies memory; should not be
- * necessary. But... This is necessary, because
- * 386 hardware has concept of busy TSS or some
- * similar stupidity.
- */
-
- load_TR_desc(); /* This does ltr */
- load_LDT(&current->active_mm->context); /* This does lldt */
-
- /*
- * Now maybe reload the debug registers
- */
- if (current->thread.debugreg7) {
- set_debugreg(current->thread.debugreg0, 0);
- set_debugreg(current->thread.debugreg1, 1);
- set_debugreg(current->thread.debugreg2, 2);
- set_debugreg(current->thread.debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(current->thread.debugreg7, 7);
- }
-
-}
-
-static void __restore_processor_state(struct saved_context *ctxt)
-{
- /*
- * control registers
- */
- /* cr4 was introduced in the Pentium CPU */
- if (ctxt->cr4)
- write_cr4(ctxt->cr4);
- write_cr3(ctxt->cr3);
- write_cr2(ctxt->cr2);
- write_cr0(ctxt->cr0);
-
- /*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
- */
- load_gdt(&ctxt->gdt);
- load_idt(&ctxt->idt);
-
- /*
- * segment registers
- */
- loadsegment(es, ctxt->es);
- loadsegment(fs, ctxt->fs);
- loadsegment(gs, ctxt->gs);
- loadsegment(ss, ctxt->ss);
-
- /*
- * sysenter MSRs
- */
- if (boot_cpu_has(X86_FEATURE_SEP))
- enable_sep_cpu();
-
- /*
- * restore XCR0 for xsave capable cpu's.
- */
- if (cpu_has_xsave)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-
- fix_processor_context();
- do_fpu_end();
- mtrr_ap_init();
- mcheck_init(&boot_cpu_data);
-}
-
-/* Needed by apm.c */
-void restore_processor_state(void)
-{
- __restore_processor_state(&saved_context);
-}
-EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 16a9020c8f1..88112b49f02 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -123,6 +123,7 @@ quiet_cmd_vdso = VDSO $@
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
+GCOV_PROFILE := n
#
# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab5..58bc00f68b1 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
}
}
+ current->mm->context.vdso = (void *)addr;
+
if (compat_uses_vma || !compat) {
/*
* MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto up_fail;
}
- current->mm->context.vdso = (void *)addr;
current_thread_info()->sysenter_return =
VDSO32_SYMBOL(addr, SYSENTER_RETURN);
up_fail:
+ if (ret)
+ current->mm->context.vdso = NULL;
+
up_write(&mm->mmap_sem);
return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098..21e1aeb9f3e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/elf.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
@@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto up_fail;
}
+ current->mm->context.vdso = (void *)addr;
+
ret = install_special_mapping(mm, addr, vdso_size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
VM_ALWAYSDUMP,
vdso_pages);
- if (ret)
+ if (ret) {
+ current->mm->context.vdso = NULL;
goto up_fail;
+ }
- current->mm->context.vdso = (void *)addr;
up_fail:
up_write(&mm->mmap_sem);
return ret;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 172438f86a0..7410640db17 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -5,6 +5,10 @@ CFLAGS_REMOVE_time.o = -pg
CFLAGS_REMOVE_irq.o = -pg
endif
+# Make sure early boot has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_enlighten.o := $(nostackp)
+
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee8..e90540a46a0 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
+#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -44,6 +45,7 @@
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
+#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
-void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
+ paravirt_end_context_switch(next);
}
static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
+ unsigned long addr;
+
if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
return 0;
info->vector = vector;
- info->address = gate_offset(*val);
+
+ addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+ /*
+ * Look for known traps using IST, and substitute them
+ * appropriately. The debugger ones are the only ones we care
+ * about. Xen will handle faults like double_fault and
+ * machine_check, so we should never see them. Warn if
+ * there's an unexpected IST-using fault handler.
+ */
+ if (addr == (unsigned long)debug)
+ addr = (unsigned long)xen_debug;
+ else if (addr == (unsigned long)int3)
+ addr = (unsigned long)xen_int3;
+ else if (addr == (unsigned long)stack_segment)
+ addr = (unsigned long)xen_stack_segment;
+ else if (addr == (unsigned long)double_fault ||
+ addr == (unsigned long)nmi) {
+ /* Don't need to handle these */
+ return 0;
+#ifdef CONFIG_X86_MCE
+ } else if (addr == (unsigned long)machine_check) {
+ return 0;
+#endif
+ } else {
+ /* Some other trap using IST? */
+ if (WARN_ON(val->ist != 0))
+ return 0;
+ }
+#endif /* CONFIG_X86_64 */
+ info->address = addr;
+
info->cs = gate_segment(*val);
info->flags = val->dpl;
/* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+ unsigned long cr0 = percpu_read(xen_cr0_value);
+
+ if (unlikely(cr0 == 0)) {
+ cr0 = native_read_cr0();
+ percpu_write(xen_cr0_value, cr0);
+ }
+
+ return cr0;
+}
+
static void xen_write_cr0(unsigned long cr0)
{
struct multicall_space mcs;
+ percpu_write(xen_cr0_value, cr0);
+
/* Only pay attention to cr0.TS; everything else is
ignored. */
mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.clts = xen_clts,
- .read_cr0 = native_read_cr0,
+ .read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
/* Xen takes care of %gs when switching to usermode for us */
.swapgs = paravirt_nop,
- .lazy_mode = {
- .enter = paravirt_enter_lazy_cpu,
- .leave = xen_leave_lazy,
- },
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
};
static const struct pv_apic_ops xen_apic_ops __initdata = {
@@ -925,10 +974,6 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
- BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
-
- xen_setup_features();
-
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
@@ -937,8 +982,15 @@ asmlinkage void __init xen_start_kernel(void)
pv_apic_ops = xen_apic_ops;
pv_mmu_ops = xen_mmu_ops;
- xen_init_irq_ops();
+#ifdef CONFIG_X86_64
+ /*
+ * Setup percpu state. We only need to do this for 64-bit
+ * because 32-bit already has %fs set properly.
+ */
+ load_percpu_segment(0);
+#endif
+ xen_init_irq_ops();
xen_init_cpuid_mask();
#ifdef CONFIG_X86_LOCAL_APIC
@@ -948,6 +1000,8 @@ asmlinkage void __init xen_start_kernel(void)
set_xen_basic_apic_ops();
#endif
+ xen_setup_features();
+
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -955,13 +1009,6 @@ asmlinkage void __init xen_start_kernel(void)
machine_ops = xen_machine_ops;
-#ifdef CONFIG_X86_64
- /*
- * Setup percpu state. We only need to do this for 64-bit
- * because 32-bit already has %fs set properly.
- */
- load_percpu_segment(0);
-#endif
/*
* The only reliable way to retain the initial address of the
* percpu gdt_page is to remember it here, so we can go and
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fba55b1a402..4ceb2858165 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- /* updates to init_mm may be done without lock */
- if (mm == &init_mm)
- preempt_disable();
-
ADD_STATS(set_pte_at, 1);
// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
}
xen_set_pte(ptep, pteval);
-out:
- if (mm == &init_mm)
- preempt_enable();
+out: return;
}
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);
- arch_flush_lazy_cpu_mode();
- }
}
static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
- arch_flush_lazy_cpu_mode();
}
/* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
xen_mark_init_mm_pinned();
}
+static void xen_leave_lazy_mmu(void)
+{
+ preempt_disable();
+ xen_mc_flush();
+ paravirt_leave_lazy_mmu();
+ preempt_enable();
+}
+
const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy,
+ .leave = xen_leave_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a..ad0047f47cd 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
- e820_add_region(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list,
- E820_RESERVED);
+ reserve_early(__pa(xen_start_info->mfn_list),
+ __pa(xen_start_info->pt_base),
+ "XEN START INFO");
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ca6596b05d5..22494fd4c9b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
-void xen_leave_lazy(void);
void xen_post_allocator_init(void);
char * __init xen_memory_setup(void);