summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig761
-rw-r--r--arch/x86/Kconfig.cpu111
-rw-r--r--arch/x86/Kconfig.debug74
-rw-r--r--arch/x86/Makefile43
-rw-r--r--arch/x86/boot/tty.c2
-rw-r--r--arch/x86/boot/video-vesa.c11
-rw-r--r--arch/x86/boot/video-vga.c4
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/configs/i386_defconfig418
-rw-r--r--arch/x86/configs/x86_64_defconfig424
-rw-r--r--arch/x86/crypto/crc32c-intel.c121
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c445
-rw-r--r--arch/x86/ia32/ia32entry.S16
-rw-r--r--arch/x86/ia32/ipc32.c1
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/include/asm/a.out-core.h4
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h83
-rw-r--r--arch/x86/include/asm/apic.h78
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/apm.h (renamed from arch/x86/include/asm/mach-default/apm.h)0
-rw-r--r--arch/x86/include/asm/atomic_32.h10
-rw-r--r--arch/x86/include/asm/atomic_64.h18
-rw-r--r--arch/x86/include/asm/bigsmp/apic.h139
-rw-r--r--arch/x86/include/asm/bigsmp/apicdef.h13
-rw-r--r--arch/x86/include/asm/bigsmp/ipi.h25
-rw-r--r--arch/x86/include/asm/bitops.h24
-rw-r--r--arch/x86/include/asm/bug.h2
-rw-r--r--arch/x86/include/asm/byteorder.h75
-rw-r--r--arch/x86/include/asm/calling.h56
-rw-r--r--arch/x86/include/asm/cpu.h17
-rw-r--r--arch/x86/include/asm/cpufeature.h6
-rw-r--r--arch/x86/include/asm/cpumask.h32
-rw-r--r--arch/x86/include/asm/current.h24
-rw-r--r--arch/x86/include/asm/desc.h10
-rw-r--r--arch/x86/include/asm/dma-mapping.h10
-rw-r--r--arch/x86/include/asm/do_timer.h (renamed from arch/x86/include/asm/mach-default/do_timer.h)0
-rw-r--r--arch/x86/include/asm/ds.h318
-rw-r--r--arch/x86/include/asm/dwarf2.h97
-rw-r--r--arch/x86/include/asm/e820.h1
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/elf.h17
-rw-r--r--arch/x86/include/asm/emergency-restart.h4
-rw-r--r--arch/x86/include/asm/entry_arch.h (renamed from arch/x86/include/asm/mach-default/entry_arch.h)25
-rw-r--r--arch/x86/include/asm/es7000/apic.h193
-rw-r--r--arch/x86/include/asm/es7000/apicdef.h13
-rw-r--r--arch/x86/include/asm/es7000/ipi.h24
-rw-r--r--arch/x86/include/asm/es7000/mpparse.h30
-rw-r--r--arch/x86/include/asm/es7000/wakecpu.h58
-rw-r--r--arch/x86/include/asm/fixmap_32.h4
-rw-r--r--arch/x86/include/asm/fixmap_64.h4
-rw-r--r--arch/x86/include/asm/ftrace.h61
-rw-r--r--arch/x86/include/asm/gart.h33
-rw-r--r--arch/x86/include/asm/genapic.h262
-rw-r--r--arch/x86/include/asm/genapic_32.h126
-rw-r--r--arch/x86/include/asm/genapic_64.h58
-rw-r--r--arch/x86/include/asm/hardirq.h49
-rw-r--r--arch/x86/include/asm/hardirq_32.h28
-rw-r--r--arch/x86/include/asm/hardirq_64.h23
-rw-r--r--arch/x86/include/asm/hw_irq.h28
-rw-r--r--arch/x86/include/asm/hypervisor.h26
-rw-r--r--arch/x86/include/asm/ia32.h18
-rw-r--r--arch/x86/include/asm/idle.h5
-rw-r--r--arch/x86/include/asm/io.h131
-rw-r--r--arch/x86/include/asm/io_32.h88
-rw-r--r--arch/x86/include/asm/io_64.h63
-rw-r--r--arch/x86/include/asm/io_apic.h52
-rw-r--r--arch/x86/include/asm/iomap.h30
-rw-r--r--arch/x86/include/asm/iommu.h36
-rw-r--r--arch/x86/include/asm/ipi.h62
-rw-r--r--arch/x86/include/asm/irq.h11
-rw-r--r--arch/x86/include/asm/irq_regs.h36
-rw-r--r--arch/x86/include/asm/irq_regs_32.h29
-rw-r--r--arch/x86/include/asm/irq_regs_64.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h200
-rw-r--r--arch/x86/include/asm/kvm.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h47
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h11
-rw-r--r--arch/x86/include/asm/lguest.h2
-rw-r--r--arch/x86/include/asm/linkage.h60
-rw-r--r--arch/x86/include/asm/mach-default/mach_apic.h156
-rw-r--r--arch/x86/include/asm/mach-default/mach_apicdef.h24
-rw-r--r--arch/x86/include/asm/mach-default/mach_ipi.h64
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpparse.h17
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-default/mach_wakecpu.h41
-rw-r--r--arch/x86/include/asm/mach-generic/gpio.h15
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apic.h33
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apicdef.h11
-rw-r--r--arch/x86/include/asm/mach-generic/mach_ipi.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpparse.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-rdc321x/gpio.h60
-rw-r--r--arch/x86/include/asm/mach_timer.h (renamed from arch/x86/include/asm/mach-default/mach_timer.h)0
-rw-r--r--arch/x86/include/asm/mach_traps.h (renamed from arch/x86/include/asm/mach-default/mach_traps.h)0
-rw-r--r--arch/x86/include/asm/math_emu.h29
-rw-r--r--arch/x86/include/asm/mce.h5
-rw-r--r--arch/x86/include/asm/mmu_context.h63
-rw-r--r--arch/x86/include/asm/mmu_context_32.h56
-rw-r--r--arch/x86/include/asm/mmu_context_64.h54
-rw-r--r--arch/x86/include/asm/mmzone_32.h4
-rw-r--r--arch/x86/include/asm/mpspec.h43
-rw-r--r--arch/x86/include/asm/mpspec_def.h125
-rw-r--r--arch/x86/include/asm/msr-index.h31
-rw-r--r--arch/x86/include/asm/msr.h15
-rw-r--r--arch/x86/include/asm/mtrr.h26
-rw-r--r--arch/x86/include/asm/numaq.h2
-rw-r--r--arch/x86/include/asm/numaq/apic.h136
-rw-r--r--arch/x86/include/asm/numaq/apicdef.h14
-rw-r--r--arch/x86/include/asm/numaq/ipi.h25
-rw-r--r--arch/x86/include/asm/numaq/mpparse.h7
-rw-r--r--arch/x86/include/asm/numaq/wakecpu.h43
-rw-r--r--arch/x86/include/asm/page.h19
-rw-r--r--arch/x86/include/asm/page_64.h4
-rw-r--r--arch/x86/include/asm/paravirt.h464
-rw-r--r--arch/x86/include/asm/pat.h4
-rw-r--r--arch/x86/include/asm/pci-functions.h (renamed from arch/x86/include/asm/mach-default/pci-functions.h)0
-rw-r--r--arch/x86/include/asm/pci.h14
-rw-r--r--arch/x86/include/asm/pci_64.h15
-rw-r--r--arch/x86/include/asm/pci_x86.h (renamed from arch/x86/pci/pci.h)18
-rw-r--r--arch/x86/include/asm/pda.h137
-rw-r--r--arch/x86/include/asm/percpu.h169
-rw-r--r--arch/x86/include/asm/pgalloc.h1
-rw-r--r--arch/x86/include/asm/pgtable-2level.h52
-rw-r--r--arch/x86/include/asm/pgtable-3level.h36
-rw-r--r--arch/x86/include/asm/pgtable.h299
-rw-r--r--arch/x86/include/asm/pgtable_32.h55
-rw-r--r--arch/x86/include/asm/pgtable_64.h93
-rw-r--r--arch/x86/include/asm/prctl.h1
-rw-r--r--arch/x86/include/asm/processor.h47
-rw-r--r--arch/x86/include/asm/proto.h4
-rw-r--r--arch/x86/include/asm/ptrace-abi.h2
-rw-r--r--arch/x86/include/asm/ptrace.h47
-rw-r--r--arch/x86/include/asm/rdc321x_defs.h (renamed from arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h)0
-rw-r--r--arch/x86/include/asm/segment.h9
-rw-r--r--arch/x86/include/asm/setup.h26
-rw-r--r--arch/x86/include/asm/setup_arch.h (renamed from arch/x86/include/asm/mach-default/setup_arch.h)0
-rw-r--r--arch/x86/include/asm/sigcontext.h2
-rw-r--r--arch/x86/include/asm/sigcontext32.h2
-rw-r--r--arch/x86/include/asm/sigframe.h70
-rw-r--r--arch/x86/include/asm/signal.h6
-rw-r--r--arch/x86/include/asm/smp.h71
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h (renamed from arch/x86/include/asm/mach-default/smpboot_hooks.h)8
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/spinlock.h70
-rw-r--r--arch/x86/include/asm/stackprotector.h124
-rw-r--r--arch/x86/include/asm/summit/apic.h184
-rw-r--r--arch/x86/include/asm/summit/apicdef.h13
-rw-r--r--arch/x86/include/asm/summit/ipi.h25
-rw-r--r--arch/x86/include/asm/summit/mpparse.h109
-rw-r--r--arch/x86/include/asm/svm.h (renamed from arch/x86/kvm/svm.h)0
-rw-r--r--arch/x86/include/asm/swab.h61
-rw-r--r--arch/x86/include/asm/swiotlb.h38
-rw-r--r--arch/x86/include/asm/sys_ia32.h101
-rw-r--r--arch/x86/include/asm/syscalls.h34
-rw-r--r--arch/x86/include/asm/system.h73
-rw-r--r--arch/x86/include/asm/thread_info.h30
-rw-r--r--arch/x86/include/asm/timex.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h17
-rw-r--r--arch/x86/include/asm/topology.h71
-rw-r--r--arch/x86/include/asm/trampoline.h8
-rw-r--r--arch/x86/include/asm/traps.h11
-rw-r--r--arch/x86/include/asm/tsc.h8
-rw-r--r--arch/x86/include/asm/uaccess.h144
-rw-r--r--arch/x86/include/asm/uaccess_32.h8
-rw-r--r--arch/x86/include/asm/uaccess_64.h8
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/include/asm/unwind.h13
-rw-r--r--arch/x86/include/asm/uv/bios.h34
-rw-r--r--arch/x86/include/asm/uv/uv.h36
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h47
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h103
-rw-r--r--arch/x86/include/asm/virtext.h132
-rw-r--r--arch/x86/include/asm/vmi.h8
-rw-r--r--arch/x86/include/asm/vmware.h27
-rw-r--r--arch/x86/include/asm/vmx.h (renamed from arch/x86/kvm/vmx.h)28
-rw-r--r--arch/x86/include/asm/voyager.h42
-rw-r--r--arch/x86/include/asm/xen/events.h6
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h39
-rw-r--r--arch/x86/include/asm/xen/page.h7
-rw-r--r--arch/x86/kernel/Makefile38
-rw-r--r--arch/x86/kernel/acpi/boot.c251
-rw-r--r--arch/x86/kernel/acpi/cstate.c74
-rw-r--r--arch/x86/kernel/acpi/sleep.c3
-rw-r--r--arch/x86/kernel/amd_iommu.c707
-rw-r--r--arch/x86/kernel/amd_iommu_init.c29
-rw-r--r--arch/x86/kernel/aperture_64.c5
-rw-r--r--arch/x86/kernel/apic.c386
-rw-r--r--arch/x86/kernel/apm_32.c10
-rw-r--r--arch/x86/kernel/asm-offsets_32.c3
-rw-r--r--arch/x86/kernel/asm-offsets_64.c15
-rw-r--r--arch/x86/kernel/bigsmp_32.c266
-rw-r--r--arch/x86/kernel/bios_uv.c60
-rw-r--r--arch/x86/kernel/check.c161
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c52
-rw-r--r--arch/x86/kernel/cpu/amd.c11
-rw-r--r--arch/x86/kernel/cpu/common.c281
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c192
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c76
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c72
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c18
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c9
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c58
-rw-r--r--arch/x86/kernel/cpu/intel.c53
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c140
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c131
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c22
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c359
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h18
-rw-r--r--arch/x86/kernel/cpu/vmware.c112
-rw-r--r--arch/x86/kernel/cpuid.c8
-rw-r--r--arch/x86/kernel/crash.c22
-rw-r--r--arch/x86/kernel/ds.c1143
-rw-r--r--arch/x86/kernel/dumpstack.c351
-rw-r--r--arch/x86/kernel/dumpstack.h39
-rw-r--r--arch/x86/kernel/dumpstack_32.c307
-rw-r--r--arch/x86/kernel/dumpstack_64.c322
-rw-r--r--arch/x86/kernel/e820.c37
-rw-r--r--arch/x86/kernel/early-quirks.c41
-rw-r--r--arch/x86/kernel/early_printk.c51
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/efi_64.c1
-rw-r--r--arch/x86/kernel/entry_32.S925
-rw-r--r--arch/x86/kernel/entry_64.S1488
-rw-r--r--arch/x86/kernel/es7000_32.c498
-rw-r--r--arch/x86/kernel/ftrace.c391
-rw-r--r--arch/x86/kernel/genapic_64.c28
-rw-r--r--arch/x86/kernel/genapic_flat_64.c265
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c166
-rw-r--r--arch/x86/kernel/genx2apic_phys.c167
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c264
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/head32.c3
-rw-r--r--arch/x86/kernel/head64.c26
-rw-r--r--arch/x86/kernel/head_32.S40
-rw-r--r--arch/x86/kernel/head_64.S21
-rw-r--r--arch/x86/kernel/hpet.c34
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8237.c17
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/i8259.c8
-rw-r--r--arch/x86/kernel/init_task.c2
-rw-r--r--arch/x86/kernel/io_apic.c1410
-rw-r--r--arch/x86/kernel/ioport.c7
-rw-r--r--arch/x86/kernel/ipi.c176
-rw-r--r--arch/x86/kernel/irq.c52
-rw-r--r--arch/x86/kernel/irq_32.c56
-rw-r--r--arch/x86/kernel/irq_64.c105
-rw-r--r--arch/x86/kernel/irqinit_32.c44
-rw-r--r--arch/x86/kernel/irqinit_64.c90
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes.c11
-rw-r--r--arch/x86/kernel/kvmclock.c12
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/mfgpt_32.c4
-rw-r--r--arch/x86/kernel/microcode_amd.c232
-rw-r--r--arch/x86/kernel/microcode_core.c25
-rw-r--r--arch/x86/kernel/microcode_intel.c18
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c3
-rw-r--r--arch/x86/kernel/module_32.c6
-rw-r--r--arch/x86/kernel/module_64.c32
-rw-r--r--arch/x86/kernel/mpparse.c501
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/nmi.c72
-rw-r--r--arch/x86/kernel/numaq_32.c353
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c13
-rw-r--r--arch/x86/kernel/paravirt.c55
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c12
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c15
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c26
-rw-r--r--arch/x86/kernel/pci-gart_64.c10
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c29
-rw-r--r--arch/x86/kernel/probe_32.c411
-rw-r--r--arch/x86/kernel/probe_roms_32.c2
-rw-r--r--arch/x86/kernel/process.c43
-rw-r--r--arch/x86/kernel/process_32.c143
-rw-r--r--arch/x86/kernel/process_64.c109
-rw-r--r--arch/x86/kernel/ptrace.c451
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c116
-rw-r--r--arch/x86/kernel/setup.c211
-rw-r--r--arch/x86/kernel/setup_percpu.c426
-rw-r--r--arch/x86/kernel/sigframe.h42
-rw-r--r--arch/x86/kernel/signal.c (renamed from arch/x86/kernel/signal_32.c)712
-rw-r--r--arch/x86/kernel/signal_64.c516
-rw-r--r--arch/x86/kernel/smp.c63
-rw-r--r--arch/x86/kernel/smpboot.c320
-rw-r--r--arch/x86/kernel/smpcommon.c30
-rw-r--r--arch/x86/kernel/stacktrace.c66
-rw-r--r--arch/x86/kernel/summit_32.c416
-rw-r--r--arch/x86/kernel/syscall_table_32.S22
-rw-r--r--arch/x86/kernel/time_32.c8
-rw-r--r--arch/x86/kernel/time_64.c8
-rw-r--r--arch/x86/kernel/tlb_32.c257
-rw-r--r--arch/x86/kernel/tlb_uv.c86
-rw-r--r--arch/x86/kernel/trampoline.c19
-rw-r--r--arch/x86/kernel/trampoline_64.S19
-rw-r--r--arch/x86/kernel/traps.c93
-rw-r--r--arch/x86/kernel/tsc.c44
-rw-r--r--arch/x86/kernel/tsc_sync.c12
-rw-r--r--arch/x86/kernel/visws_quirks.c36
-rw-r--r--arch/x86/kernel/vm86_32.c20
-rw-r--r--arch/x86/kernel/vmi_32.c155
-rw-r--r--arch/x86/kernel/vmiclock_32.c4
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S10
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S36
-rw-r--r--arch/x86/kernel/vsmp_64.c12
-rw-r--r--arch/x86/kernel/vsyscall_64.c12
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/kernel/xsave.c4
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/i8254.c23
-rw-r--r--arch/x86/kvm/i8259.c52
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c58
-rw-r--r--arch/x86/kvm/mmu.c448
-rw-r--r--arch/x86/kvm/paging_tmpl.h45
-rw-r--r--arch/x86/kvm/svm.c48
-rw-r--r--arch/x86/kvm/vmx.c349
-rw-r--r--arch/x86/kvm/x86.c120
-rw-r--r--arch/x86/kvm/x86_emulate.c297
-rw-r--r--arch/x86/lguest/boot.c24
-rw-r--r--arch/x86/lguest/i386_head.S15
-rw-r--r--arch/x86/lib/usercopy_32.c12
-rw-r--r--arch/x86/lib/usercopy_64.c8
-rw-r--r--arch/x86/mach-default/Makefile5
-rw-r--r--arch/x86/mach-default/setup.c163
-rw-r--r--arch/x86/mach-generic/Makefile11
-rw-r--r--arch/x86/mach-generic/bigsmp.c58
-rw-r--r--arch/x86/mach-generic/default.c26
-rw-r--r--arch/x86/mach-generic/es7000.c92
-rw-r--r--arch/x86/mach-generic/numaq.c55
-rw-r--r--arch/x86/mach-generic/probe.c139
-rw-r--r--arch/x86/mach-generic/summit.c40
-rw-r--r--arch/x86/mach-rdc321x/Makefile5
-rw-r--r--arch/x86/mach-rdc321x/gpio.c194
-rw-r--r--arch/x86/mach-rdc321x/platform.c69
-rw-r--r--arch/x86/mach-voyager/setup.c13
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c60
-rw-r--r--arch/x86/math-emu/fpu_entry.c6
-rw-r--r--arch/x86/math-emu/fpu_proto.h4
-rw-r--r--arch/x86/math-emu/fpu_system.h16
-rw-r--r--arch/x86/math-emu/get_address.c75
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/extable.c6
-rw-r--r--arch/x86/mm/fault.c472
-rw-r--r--arch/x86/mm/init_32.c228
-rw-r--r--arch/x86/mm/init_64.c8
-rw-r--r--arch/x86/mm/iomap_32.c10
-rw-r--r--arch/x86/mm/ioremap.c30
-rw-r--r--arch/x86/mm/k8topology_64.c20
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/numa_32.c35
-rw-r--r--arch/x86/mm/numa_64.c221
-rw-r--r--arch/x86/mm/pageattr.c49
-rw-r--r--arch/x86/mm/pat.c323
-rw-r--r--arch/x86/mm/srat_64.c3
-rw-r--r--arch/x86/mm/tlb.c (renamed from arch/x86/kernel/tlb_64.c)126
-rw-r--r--arch/x86/oprofile/nmi_int.c5
-rw-r--r--arch/x86/oprofile/op_model_amd.c224
-rw-r--r--arch/x86/oprofile/op_model_ppro.c6
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/common.c32
-rw-r--r--arch/x86/pci/direct.c6
-rw-r--r--arch/x86/pci/early.c2
-rw-r--r--arch/x86/pci/fixup.c28
-rw-r--r--arch/x86/pci/i386.c18
-rw-r--r--arch/x86/pci/init.c5
-rw-r--r--arch/x86/pci/irq.c58
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/mmconfig-shared.c3
-rw-r--r--arch/x86/pci/mmconfig_32.c2
-rw-r--r--arch/x86/pci/mmconfig_64.c3
-rw-r--r--arch/x86/pci/numaq_32.c8
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/pci/pcbios.c5
-rw-r--r--arch/x86/pci/visws.c23
-rw-r--r--arch/x86/power/hibernate_32.c4
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c2
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c790
-rw-r--r--arch/x86/xen/irq.c39
-rw-r--r--arch/x86/xen/mmu.c811
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/multicalls.c2
-rw-r--r--arch/x86/xen/multicalls.h6
-rw-r--r--arch/x86/xen/setup.c9
-rw-r--r--arch/x86/xen/smp.c70
-rw-r--r--arch/x86/xen/suspend.c4
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-asm.S142
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S343
-rw-r--r--arch/x86/xen/xen-asm_64.S252
-rw-r--r--arch/x86/xen/xen-ops.h14
416 files changed, 19842 insertions, 15836 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 93224b56918..1042d69b267 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -5,7 +5,7 @@ mainmenu "Linux Kernel Configuration for x86"
config 64BIT
bool "64-bit kernel" if ARCH = "x86"
default ARCH = "x86_64"
- help
+ ---help---
Say yes to build a 64-bit kernel - formerly known as x86_64
Say no to build a 32-bit kernel - formerly known as i386
@@ -19,21 +19,27 @@ config X86_64
config X86
def_bool y
select HAVE_AOUT if X86_32
+ select HAVE_READQ
+ select HAVE_WRITEQ
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
+ select ARCH_WANT_FRAME_POINTERS
select HAVE_KRETPROBES
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
- select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ select HAVE_KVM
+ select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select USER_STACKTRACE_SUPPORT
config ARCH_DEFCONFIG
string
@@ -87,6 +93,10 @@ config GENERIC_IOMAP
config GENERIC_BUG
def_bool y
depends on BUG
+ select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+
+config GENERIC_BUG_RELATIVE_POINTERS
+ bool
config GENERIC_HWEIGHT
def_bool y
@@ -123,18 +133,16 @@ config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
config HAVE_SETUP_PER_CPU_AREA
- def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
+ def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
config ARCH_HIBERNATION_POSSIBLE
def_bool y
- depends on !SMP || !X86_VOYAGER
config ARCH_SUSPEND_POSSIBLE
def_bool y
- depends on !X86_VOYAGER
config ZONE_DMA32
bool
@@ -164,11 +172,9 @@ config GENERIC_PENDING_IRQ
depends on GENERIC_HARDIRQS && SMP
default y
-config X86_SMP
- bool
- depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
- select USE_GENERIC_SMP_HELPERS
- default y
+config USE_GENERIC_SMP_HELPERS
+ def_bool y
+ depends on SMP
config X86_32_SMP
def_bool y
@@ -181,19 +187,17 @@ config X86_64_SMP
config X86_HT
bool
depends on SMP
- depends on (X86_32 && !X86_VOYAGER) || X86_64
- default y
-
-config X86_BIOS_REBOOT
- bool
- depends on !X86_VOYAGER
default y
config X86_TRAMPOLINE
bool
- depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
+ depends on SMP || (64BIT && ACPI_SLEEP)
default y
+config X86_32_LAZY_GS
+ def_bool y
+ depends on X86_32 && !CC_STACKPROTECTOR
+
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
@@ -231,144 +235,171 @@ config SMP
If you don't know what to do here, say N.
-config X86_HAS_BOOT_CPU_ID
- def_bool y
- depends on X86_VOYAGER
+config SPARSE_IRQ
+ bool "Support sparse irq numbering"
+ depends on PCI_MSI || HT_IRQ
+ ---help---
+ This enables support for sparse irqs. This is useful for distro
+ kernels that want to define a high CONFIG_NR_CPUS value but still
+ want to have low kernel memory footprint on smaller machines.
-config X86_FIND_SMP_CONFIG
- def_bool y
- depends on X86_MPPARSE || X86_VOYAGER
+ ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+ out the irq_desc[] array in a more NUMA-friendly way. )
+
+ If you don't know what to do here, say N.
+
+config NUMA_MIGRATE_IRQ_DESC
+ bool "Move irq desc when changing irq smp_affinity"
+ depends on SPARSE_IRQ && NUMA
+ default n
+ ---help---
+ This enables moving irq_desc to the cpu/node that will handle the irq.
+
+ If you don't know what to do here, say N.
-if ACPI
config X86_MPPARSE
- def_bool y
- bool "Enable MPS table"
+ bool "Enable MPS table" if ACPI
+ default y
depends on X86_LOCAL_APIC
- help
+ ---help---
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
-endif
-if !ACPI
-config X86_MPPARSE
- def_bool y
- depends on X86_LOCAL_APIC
-endif
+config X86_BIGSMP
+ bool "Support for big SMP systems with more than 8 CPUs"
+ depends on X86_32 && SMP
+ ---help---
+ This option is needed for the systems that have more than 8 CPUs
-choice
- prompt "Subarchitecture Type"
- default X86_PC
+config X86_EXTENDED_PLATFORM
+ bool "Support for extended (non-PC) x86 platforms"
+ default y
+ ---help---
+ If you disable this option then the kernel will only support
+ standard PC platforms (which cover the vast majority of
+ systems out there).
-config X86_PC
- bool "PC-compatible"
- help
- Choose this option if your computer is a standard PC or compatible.
+ If you enable this option then you'll be able to select a number
+ of non-PC x86 platforms.
+
+ If you have one of these systems, or if you want to build a
+ generic distribution kernel, say Y here - otherwise say N.
+
+# This is an alphabetically sorted list of 64 bit extended platforms
+# Please maintain the alphabetic order if and when there are additions
+
+config X86_VSMP
+ bool "ScaleMP vSMP"
+ select PARAVIRT
+ depends on X86_64 && PCI
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
+ supposed to run on these EM64T-based machines. Only choose this option
+ if you have one of these machines.
+
+config X86_UV
+ bool "SGI Ultraviolet"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ This option is needed in order to support SGI Ultraviolet systems.
+ If you don't have one of these, you should say N here.
+
+# Following is an alphabetically sorted list of 32 bit extended platforms
+# Please maintain the alphabetic order if and when there are additions
config X86_ELAN
bool "AMD Elan"
depends on X86_32
- help
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
Select this for an AMD Elan processor.
Do not use this option for K6/Athlon/Opteron processors!
If unsure, choose "PC-compatible" instead.
-config X86_VOYAGER
- bool "Voyager (NCR)"
- depends on X86_32 && (SMP || BROKEN) && !PCI
- help
- Voyager is an MCA-based 32-way capable SMP architecture proprietary
- to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
-
- *** WARNING ***
-
- If you do not specifically know you have a Voyager based machine,
- say N here, otherwise the kernel you build will not be bootable.
-
-config X86_GENERICARCH
- bool "Generic architecture"
+config X86_RDC321X
+ bool "RDC R-321x SoC"
depends on X86_32
- help
- This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
+ depends on X86_EXTENDED_PLATFORM
+ select M486
+ select X86_REBOOTFIXUPS
+ ---help---
+ This option is needed for RDC R-321x system-on-chip, also known
+ as R-8610-(G).
+ If you don't have one of these chips, you should say N here.
+
+config X86_32_NON_STANDARD
+ bool "Support non-standard 32-bit SMP architectures"
+ depends on X86_32 && SMP
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
subarchitectures. It is intended for a generic binary kernel.
if you select them all, kernel will probe it one by one. and will
fallback to default.
-if X86_GENERICARCH
+# Alphabetically sorted list of Non standard 32 bit platforms
config X86_NUMAQ
bool "NUMAQ (IBM/Sequent)"
- depends on SMP && X86_32 && PCI && X86_MPPARSE
+ depends on X86_32_NON_STANDARD
select NUMA
- help
+ select X86_MPPARSE
+ ---help---
This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
NUMA multiquad box. This changes the way that processors are
bootstrapped, and uses Clustered Logical APIC addressing mode instead
of Flat Logical. You will need a new lynxer.elf file to flash your
firmware with - send email to <Martin.Bligh@us.ibm.com>.
+config X86_VISWS
+ bool "SGI 320/540 (Visual Workstation)"
+ depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
+ depends on X86_32_NON_STANDARD
+ ---help---
+ The SGI Visual Workstation series is an IA32-based workstation
+ based on SGI systems chips with some legacy PC hardware attached.
+
+ Say Y here to create a kernel to run on the SGI 320 or 540.
+
+ A kernel compiled for the Visual Workstation will run on general
+ PCs as well. See <file:Documentation/sgi-visws.txt> for details.
+
config X86_SUMMIT
bool "Summit/EXA (IBM x440)"
- depends on X86_32 && SMP
- help
+ depends on X86_32_NON_STANDARD
+ ---help---
This option is needed for IBM systems that use the Summit/EXA chipset.
In particular, it is needed for the x440.
config X86_ES7000
- bool "Support for Unisys ES7000 IA32 series"
- depends on X86_32 && SMP
- help
+ bool "Unisys ES7000 IA32 series"
+ depends on X86_32_NON_STANDARD && X86_BIGSMP
+ ---help---
Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
supposed to run on an IA32-based Unisys ES7000 system.
-config X86_BIGSMP
- bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
- help
- This option is needed for the systems that have more than 8 CPUs
- and if the system is not of any sub-arch type above.
-
-endif
-
-config X86_VSMP
- bool "Support for ScaleMP vSMP"
- select PARAVIRT
- depends on X86_64 && PCI
- help
- Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
- supposed to run on these EM64T-based machines. Only choose this option
- if you have one of these machines.
-
-endchoice
-
-config X86_VISWS
- bool "SGI 320/540 (Visual Workstation)"
- depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
- help
- The SGI Visual Workstation series is an IA32-based workstation
- based on SGI systems chips with some legacy PC hardware attached.
-
- Say Y here to create a kernel to run on the SGI 320 or 540.
+config X86_VOYAGER
+ bool "Voyager (NCR)"
+ depends on SMP && !PCI && BROKEN
+ depends on X86_32_NON_STANDARD
+ ---help---
+ Voyager is an MCA-based 32-way capable SMP architecture proprietary
+ to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
- A kernel compiled for the Visual Workstation will run on general
- PCs as well. See <file:Documentation/sgi-visws.txt> for details.
+ *** WARNING ***
-config X86_RDC321X
- bool "RDC R-321x SoC"
- depends on X86_32
- select M486
- select X86_REBOOTFIXUPS
- help
- This option is needed for RDC R-321x system-on-chip, also known
- as R-8610-(G).
- If you don't have one of these chips, you should say N here.
+ If you do not specifically know you have a Voyager based machine,
+ say N here, otherwise the kernel you build will not be bootable.
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
- depends on X86_32
- help
+ depends on X86
+ ---help---
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
caller function. This provides more accurate wchan values,
@@ -378,7 +409,7 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
menuconfig PARAVIRT_GUEST
bool "Paravirtualized guest support"
- help
+ ---help---
Say Y here to get to see options related to running Linux under
various hypervisors. This option alone does not add any kernel code.
@@ -392,8 +423,7 @@ config VMI
bool "VMI Guest support"
select PARAVIRT
depends on X86_32
- depends on !X86_VOYAGER
- help
+ ---help---
VMI provides a paravirtualized interface to the VMware ESX server
(it could be used by other hypervisors in theory too, but is not
at the moment), by linking the kernel to a GPL-ed ROM module
@@ -403,8 +433,7 @@ config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
select PARAVIRT_CLOCK
- depends on !X86_VOYAGER
- help
+ ---help---
Turning on this option will allow you to run a paravirtualized clock
when running over the KVM hypervisor. Instead of relying on a PIT
(or probably other) emulation by the underlying device model, the host
@@ -414,17 +443,15 @@ config KVM_CLOCK
config KVM_GUEST
bool "KVM Guest support"
select PARAVIRT
- depends on !X86_VOYAGER
- help
- This option enables various optimizations for running under the KVM
- hypervisor.
+ ---help---
+ This option enables various optimizations for running under the KVM
+ hypervisor.
source "arch/x86/lguest/Kconfig"
config PARAVIRT
bool "Enable paravirtualization code"
- depends on !X86_VOYAGER
- help
+ ---help---
This changes the kernel so it can modify itself when it is run
under a hypervisor, potentially improving performance significantly
over full virtualization. However, when run without a hypervisor
@@ -437,55 +464,51 @@ config PARAVIRT_CLOCK
endif
config PARAVIRT_DEBUG
- bool "paravirt-ops debugging"
- depends on PARAVIRT && DEBUG_KERNEL
- help
- Enable to debug paravirt_ops internals. Specifically, BUG if
- a paravirt_op is missing when it is called.
+ bool "paravirt-ops debugging"
+ depends on PARAVIRT && DEBUG_KERNEL
+ ---help---
+ Enable to debug paravirt_ops internals. Specifically, BUG if
+ a paravirt_op is missing when it is called.
config MEMTEST
bool "Memtest"
- help
+ ---help---
This option adds a kernel parameter 'memtest', which allows memtest
to be set.
- memtest=0, mean disabled; -- default
- memtest=1, mean do 1 test pattern;
- ...
- memtest=4, mean do 4 test patterns.
+ memtest=0, mean disabled; -- default
+ memtest=1, mean do 1 test pattern;
+ ...
+ memtest=4, mean do 4 test patterns.
If you are unsure how to answer this question, answer N.
config X86_SUMMIT_NUMA
def_bool y
- depends on X86_32 && NUMA && X86_GENERICARCH
+ depends on X86_32 && NUMA && X86_32_NON_STANDARD
config X86_CYCLONE_TIMER
def_bool y
- depends on X86_GENERICARCH
-
-config ES7000_CLUSTERED_APIC
- def_bool y
- depends on SMP && X86_ES7000 && MPENTIUMIII
+ depends on X86_32_NON_STANDARD
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
def_bool X86_64
prompt "HPET Timer Support" if X86_32
- help
- Use the IA-PC HPET (High Precision Event Timer) to manage
- time in preference to the PIT and RTC, if a HPET is
- present.
- HPET is the next generation timer replacing legacy 8254s.
- The HPET provides a stable time base on SMP
- systems, unlike the TSC, but it is more expensive to access,
- as it is off-chip. You can find the HPET spec at
- <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+ ---help---
+ Use the IA-PC HPET (High Precision Event Timer) to manage
+ time in preference to the PIT and RTC, if a HPET is
+ present.
+ HPET is the next generation timer replacing legacy 8254s.
+ The HPET provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. You can find the HPET spec at
+ <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
- You can safely choose Y here. However, HPET will only be
- activated if the platform and the BIOS support this feature.
- Otherwise the 8254 will be used for timing services.
+ You can safely choose Y here. However, HPET will only be
+ activated if the platform and the BIOS support this feature.
+ Otherwise the 8254 will be used for timing services.
- Choose N to continue using the legacy 8254 timer.
+ Choose N to continue using the legacy 8254 timer.
config HPET_EMULATE_RTC
def_bool y
@@ -496,7 +519,7 @@ config HPET_EMULATE_RTC
config DMI
default y
bool "Enable DMI scanning" if EMBEDDED
- help
+ ---help---
Enable scanning of DMI to identify machine quirks. Say Y
here unless you have verified that your setup is not
affected by entries in the DMI blacklist. Required by PNP
@@ -508,7 +531,7 @@ config GART_IOMMU
select SWIOTLB
select AGP
depends on X86_64 && PCI
- help
+ ---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
sound, many IDE/SATA chipsets and some other devices.
@@ -523,7 +546,7 @@ config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
select SWIOTLB
depends on X86_64 && PCI && EXPERIMENTAL
- help
+ ---help---
Support for hardware IOMMUs in IBM's xSeries x366 and x460
systems. Needed to run systems with more than 3GB of memory
properly with 32-bit PCI devices that do not support DAC
@@ -541,7 +564,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
def_bool y
prompt "Should Calgary be enabled by default?"
depends on CALGARY_IOMMU
- help
+ ---help---
Should Calgary be enabled by default? if you choose 'y', Calgary
will be used (if it exists). If you choose 'n', Calgary will not be
used even if it exists. If you choose 'n' and would like to use
@@ -553,7 +576,7 @@ config AMD_IOMMU
select SWIOTLB
select PCI_MSI
depends on X86_64 && PCI && ACPI
- help
+ ---help---
With this option you can enable support for AMD IOMMU hardware in
your system. An IOMMU is a hardware component which provides
remapping of DMA memory accesses from devices. With an AMD IOMMU you
@@ -564,10 +587,20 @@ config AMD_IOMMU
your BIOS for an option to enable it or if you have an IVRS ACPI
table.
+config AMD_IOMMU_STATS
+ bool "Export AMD IOMMU statistics to debugfs"
+ depends on AMD_IOMMU
+ select DEBUG_FS
+ ---help---
+ This option enables code in the AMD IOMMU driver to collect various
+ statistics about what's happening in the driver and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
- bool
- help
+ def_bool y if X86_64
+ ---help---
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
of Intel's x86-64 CPUs). Using this PCI devices which can only
@@ -577,22 +610,26 @@ config SWIOTLB
config IOMMU_HELPER
def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+config IOMMU_API
+ def_bool (AMD_IOMMU || DMAR)
+
config MAXSMP
bool "Configure Maximum number of SMP Processors and NUMA Nodes"
- depends on X86_64 && SMP && BROKEN
+ depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+ select CPUMASK_OFFSTACK
default n
- help
+ ---help---
Configure maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
config NR_CPUS
- int "Maximum number of CPUs (2-512)" if !MAXSMP
- range 2 512
- depends on SMP
+ int "Maximum number of CPUs" if SMP && !MAXSMP
+ range 2 512 if SMP && !MAXSMP
+ default "1" if !SMP
default "4096" if MAXSMP
- default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
- default "8"
- help
+ default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+ default "8" if SMP
+ ---help---
This allows you to specify the maximum number of CPUs which this
kernel will support. The maximum supported value is 512 and the
minimum value which makes sense is 2.
@@ -603,7 +640,7 @@ config NR_CPUS
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on X86_HT
- help
+ ---help---
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
@@ -613,7 +650,7 @@ config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
depends on X86_HT
- help
+ ---help---
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
@@ -622,8 +659,8 @@ source "kernel/Kconfig.preempt"
config X86_UP_APIC
bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
- help
+ depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+ ---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
system which has a processor with a local APIC, you can say Y here to
@@ -636,7 +673,7 @@ config X86_UP_APIC
config X86_UP_IOAPIC
bool "IO-APIC support on uniprocessors"
depends on X86_UP_APIC
- help
+ ---help---
An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
SMP-capable replacement for PC-style interrupt controllers. Most
SMP systems and many recent uniprocessor systems have one.
@@ -647,19 +684,42 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
- depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
config X86_IO_APIC
def_bool y
- depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
config X86_VISWS_APIC
def_bool y
depends on X86_32 && X86_VISWS
+config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ bool "Reroute for broken boot IRQs"
+ default n
+ depends on X86_IO_APIC
+ ---help---
+ This option enables a workaround that fixes a source of
+ spurious interrupts. This is recommended when threaded
+ interrupt handling is used on systems where the generation of
+ superfluous "boot interrupts" cannot be disabled.
+
+ Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+ entry in the chipset's IO-APIC is masked (as, e.g. the RT
+ kernel does during interrupt handling). On chipsets where this
+ boot IRQ generation cannot be disabled, this workaround keeps
+ the original IRQ line masked so that only the equivalent "boot
+ IRQ" is delivered to the CPUs. The workaround also tells the
+ kernel to set up the IRQ handler on the boot IRQ line. In this
+ way only one interrupt is delivered to the kernel. Otherwise
+ the spurious second interrupt may cause the kernel to bring
+ down (vital) interrupt lines.
+
+ Only affects "broken" chipsets. Interrupt sharing may be
+ increased on these systems.
+
config X86_MCE
bool "Machine Check Exception"
- depends on !X86_VOYAGER
---help---
Machine Check Exception support allows the processor to notify the
kernel if it detects a problem (e.g. overheating, component failure).
@@ -678,7 +738,7 @@ config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
depends on X86_64 && X86_MCE && X86_LOCAL_APIC
- help
+ ---help---
Additional support for intel specific MCE features such as
the thermal monitor.
@@ -686,14 +746,14 @@ config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
depends on X86_64 && X86_MCE && X86_LOCAL_APIC
- help
+ ---help---
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
- help
+ ---help---
Enabling this feature starts a timer that triggers every 5 seconds which
will look at the machine check registers to see if anything happened.
Non-fatal problems automatically get corrected (but still logged).
@@ -706,7 +766,7 @@ config X86_MCE_NONFATAL
config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt."
depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
- help
+ ---help---
Enabling this feature will cause a message to be printed when the P4
enters thermal throttling.
@@ -714,11 +774,11 @@ config VM86
bool "Enable VM86 support" if EMBEDDED
default y
depends on X86_32
- help
- This option is required by programs like DOSEMU to run 16-bit legacy
+ ---help---
+ This option is required by programs like DOSEMU to run 16-bit legacy
code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ XFree86 to initialize some video cards via BIOS. Disabling this
+ option saves about 6k.
config TOSHIBA
tristate "Toshiba Laptop support"
@@ -792,33 +852,33 @@ config MICROCODE
module will be called microcode.
config MICROCODE_INTEL
- bool "Intel microcode patch loading support"
- depends on MICROCODE
- default MICROCODE
- select FW_LOADER
- --help---
- This options enables microcode patch loading support for Intel
- processors.
-
- For latest news and information on obtaining all the required
- Intel ingredients for this driver, check:
- <http://www.urbanmyth.org/microcode/>.
+ bool "Intel microcode patch loading support"
+ depends on MICROCODE
+ default MICROCODE
+ select FW_LOADER
+ ---help---
+ This option enables microcode patch loading support for Intel
+ processors.
+
+ For latest news and information on obtaining all the required
+ Intel ingredients for this driver, check:
+ <http://www.urbanmyth.org/microcode/>.
config MICROCODE_AMD
- bool "AMD microcode patch loading support"
- depends on MICROCODE
- select FW_LOADER
- --help---
- If you select this option, microcode patch loading support for AMD
- processors will be enabled.
+ bool "AMD microcode patch loading support"
+ depends on MICROCODE
+ select FW_LOADER
+ ---help---
+ If you select this option, microcode patch loading support for AMD
+ processors will be enabled.
- config MICROCODE_OLD_INTERFACE
+config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
- help
+ ---help---
This device gives privileged processes access to the x86
Model-Specific Registers (MSRs). It is a character device with
major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
@@ -827,7 +887,7 @@ config X86_MSR
config X86_CPUID
tristate "/dev/cpu/*/cpuid - CPU information support"
- help
+ ---help---
This device gives processes access to the x86 CPUID instruction to
be executed on a specific processor. It is a character device
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
@@ -879,7 +939,7 @@ config NOHIGHMEM
config HIGHMEM4G
bool "4GB"
depends on !X86_NUMAQ
- help
+ ---help---
Select this if you have a 32-bit processor and between 1 and 4
gigabytes of physical RAM.
@@ -887,7 +947,7 @@ config HIGHMEM64G
bool "64GB"
depends on !M386 && !M486
select X86_PAE
- help
+ ---help---
Select this if you have a 32-bit processor and more than 4
gigabytes of physical RAM.
@@ -898,7 +958,7 @@ choice
prompt "Memory split" if EMBEDDED
default VMSPLIT_3G
depends on X86_32
- help
+ ---help---
Select the desired split between kernel and user memory.
If the address range available to the kernel is less than the
@@ -944,33 +1004,45 @@ config HIGHMEM
config X86_PAE
bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
- help
+ ---help---
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
has the cost of more pagetable lookup overhead, and also
consumes more pagetable space per process.
config ARCH_PHYS_ADDR_T_64BIT
- def_bool X86_64 || X86_PAE
+ def_bool X86_64 || X86_PAE
+
+config DIRECT_GBPAGES
+ bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ default y
+ depends on X86_64
+ ---help---
+ Allow the kernel linear mapping to use 1GB pages on CPUs that
+ support it. This can improve the kernel's performance a tiny bit by
+ reducing TLB pressure. If in doubt, say "Y".
# Common NUMA Features
config NUMA
- bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
+ bool "Numa Memory Allocation and Scheduler Support"
depends on SMP
- depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && BROKEN)
- default n if X86_PC
+ depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
- help
+ ---help---
Enable NUMA (Non Uniform Memory Access) support.
+
The kernel will try to allocate memory used by a CPU on the
local memory controller of the CPU and add some more
NUMA awareness to the kernel.
- For 32-bit this is currently highly experimental and should be only
- used for kernel development. It might also cause boot failures.
- For 64-bit this is recommended on all multiprocessor Opteron systems.
- If the system is EM64T, you should say N unless your system is
- EM64T NUMA.
+ For 64-bit this is recommended if the system is Intel Core i7
+ (or later), AMD Opteron, or EM64T NUMA.
+
+ For 32-bit this is only needed on (rare) 32-bit-only platforms
+ that support NUMA topologies, such as NUMAQ / Summit, or if you
+ boot a 32-bit kernel on a 64-bit NUMA platform.
+
+ Otherwise, you should say N.
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@ -979,19 +1051,19 @@ config K8_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
depends on X86_64 && NUMA && PCI
- help
- Enable K8 NUMA node topology detection. You should say Y here if
- you have a multi processor AMD K8 system. This uses an old
- method to read the NUMA configuration directly from the builtin
- Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
- instead, which also takes priority if both are compiled in.
+ ---help---
+ Enable K8 NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD K8 system. This uses an old
+ method to read the NUMA configuration directly from the builtin
+ Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+ instead, which also takes priority if both are compiled in.
config X86_64_ACPI_NUMA
def_bool y
prompt "ACPI NUMA detection"
depends on X86_64 && NUMA && ACPI && PCI
select ACPI_NUMA
- help
+ ---help---
Enable ACPI SRAT based node topology detection.
# Some NUMA nodes have memory ranges that span
@@ -1006,7 +1078,7 @@ config NODES_SPAN_OTHER_NODES
config NUMA_EMU
bool "NUMA emulation"
depends on X86_64 && NUMA
- help
+ ---help---
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
@@ -1019,7 +1091,7 @@ config NODES_SHIFT
default "4" if X86_NUMAQ
default "3"
depends on NEED_MULTIPLE_NODES
- help
+ ---help---
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
@@ -1057,7 +1129,7 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
+ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1074,61 +1146,61 @@ source "mm/Kconfig"
config HIGHPTE
bool "Allocate 3rd-level pagetables from highmem"
depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
- help
+ ---help---
The VM uses one page table entry for each page of physical memory.
For systems with a lot of RAM, this can be wasteful of precious
low memory. Setting this option will put user-space page table
entries in high memory.
config X86_CHECK_BIOS_CORRUPTION
- bool "Check for low memory corruption"
- help
- Periodically check for memory corruption in low memory, which
- is suspected to be caused by BIOS. Even when enabled in the
- configuration, it is disabled at runtime. Enable it by
- setting "memory_corruption_check=1" on the kernel command
- line. By default it scans the low 64k of memory every 60
- seconds; see the memory_corruption_check_size and
- memory_corruption_check_period parameters in
- Documentation/kernel-parameters.txt to adjust this.
-
- When enabled with the default parameters, this option has
- almost no overhead, as it reserves a relatively small amount
- of memory and scans it infrequently. It both detects corruption
- and prevents it from affecting the running system.
-
- It is, however, intended as a diagnostic tool; if repeatable
- BIOS-originated corruption always affects the same memory,
- you can use memmap= to prevent the kernel from using that
- memory.
+ bool "Check for low memory corruption"
+ ---help---
+ Periodically check for memory corruption in low memory, which
+ is suspected to be caused by BIOS. Even when enabled in the
+ configuration, it is disabled at runtime. Enable it by
+ setting "memory_corruption_check=1" on the kernel command
+ line. By default it scans the low 64k of memory every 60
+ seconds; see the memory_corruption_check_size and
+ memory_corruption_check_period parameters in
+ Documentation/kernel-parameters.txt to adjust this.
+
+ When enabled with the default parameters, this option has
+ almost no overhead, as it reserves a relatively small amount
+ of memory and scans it infrequently. It both detects corruption
+ and prevents it from affecting the running system.
+
+ It is, however, intended as a diagnostic tool; if repeatable
+ BIOS-originated corruption always affects the same memory,
+ you can use memmap= to prevent the kernel from using that
+ memory.
config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
- bool "Set the default setting of memory_corruption_check"
+ bool "Set the default setting of memory_corruption_check"
depends on X86_CHECK_BIOS_CORRUPTION
default y
- help
- Set whether the default state of memory_corruption_check is
- on or off.
+ ---help---
+ Set whether the default state of memory_corruption_check is
+ on or off.
config X86_RESERVE_LOW_64K
- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+ bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
default y
- help
- Reserve the first 64K of physical RAM on BIOSes that are known
- to potentially corrupt that memory range. A numbers of BIOSes are
- known to utilize this area during suspend/resume, so it must not
- be used by the kernel.
+ ---help---
+ Reserve the first 64K of physical RAM on BIOSes that are known
+ to potentially corrupt that memory range. A number of BIOSes are
+ known to utilize this area during suspend/resume, so it must not
+ be used by the kernel.
- Set this to N if you are absolutely sure that you trust the BIOS
- to get all its memory reservations and usages right.
+ Set this to N if you are absolutely sure that you trust the BIOS
+ to get all its memory reservations and usages right.
- If you have doubts about the BIOS (e.g. suspend/resume does not
- work or there's kernel crashes after certain hardware hotplug
- events) and it's not AMI or Phoenix, then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
- corruption patterns.
+ If you have doubts about the BIOS (e.g. suspend/resume does not
+ work or there's kernel crashes after certain hardware hotplug
+ events) and it's not AMI or Phoenix, then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+ corruption patterns.
- Say Y if unsure.
+ Say Y if unsure.
config MATH_EMULATION
bool
@@ -1194,7 +1266,7 @@ config MTRR_SANITIZER
def_bool y
prompt "MTRR cleanup support"
depends on MTRR
- help
+ ---help---
Convert MTRR layout from continuous to discrete, so X drivers can
add writeback entries.
@@ -1209,7 +1281,7 @@ config MTRR_SANITIZER_ENABLE_DEFAULT
range 0 1
default "0"
depends on MTRR_SANITIZER
- help
+ ---help---
Enable mtrr cleanup default value
config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
@@ -1217,7 +1289,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
range 0 7
default "1"
depends on MTRR_SANITIZER
- help
+ ---help---
mtrr cleanup spare entries default, it can be changed via
mtrr_spare_reg_nr=N on the kernel command line.
@@ -1225,7 +1297,7 @@ config X86_PAT
bool
prompt "x86 PAT support"
depends on MTRR
- help
+ ---help---
Use PAT attributes to setup page level cache control.
PATs are the modern equivalents of MTRRs and are much more
@@ -1240,20 +1312,20 @@ config EFI
bool "EFI runtime service support"
depends on ACPI
---help---
- This enables the kernel to use EFI runtime services that are
- available (such as the EFI variable services).
+ This enables the kernel to use EFI runtime services that are
+ available (such as the EFI variable services).
- This option is only useful on systems that have EFI firmware.
- In addition, you should use the latest ELILO loader available
- at <http://elilo.sourceforge.net> in order to take advantage
- of EFI runtime services. However, even with this option, the
- resultant kernel should continue to boot on existing non-EFI
- platforms.
+ This option is only useful on systems that have EFI firmware.
+ In addition, you should use the latest ELILO loader available
+ at <http://elilo.sourceforge.net> in order to take advantage
+ of EFI runtime services. However, even with this option, the
+ resultant kernel should continue to boot on existing non-EFI
+ platforms.
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
- help
+ ---help---
This kernel feature is useful for number crunching applications
that may need to compute untrusted bytecode during their
execution. By using pipes or other transports made available to
@@ -1266,13 +1338,16 @@ config SECCOMP
If unsure, say Y. Only embedded should say N here.
+config CC_STACKPROTECTOR_ALL
+ bool
+
config CC_STACKPROTECTOR
bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- depends on X86_64 && EXPERIMENTAL && BROKEN
- help
- This option turns on the -fstack-protector GCC feature. This
- feature puts, at the beginning of critical functions, a canary
- value on the stack just before the return address, and validates
+ select CC_STACKPROTECTOR_ALL
+ ---help---
+ This option turns on the -fstack-protector GCC feature. This
+ feature puts, at the beginning of functions, a canary value on
+ the stack just before the return address, and validates
the value just before actually returning. Stack based buffer
overflows (that need to overwrite this return address) now also
overwrite the canary, which gets detected and the attack is then
@@ -1280,22 +1355,14 @@ config CC_STACKPROTECTOR
This feature requires gcc version 4.2 or above, or a distribution
gcc with the feature backported. Older versions are automatically
- detected and for those versions, this configuration option is ignored.
-
-config CC_STACKPROTECTOR_ALL
- bool "Use stack-protector for all functions"
- depends on CC_STACKPROTECTOR
- help
- Normally, GCC only inserts the canary value protection for
- functions that use large-ish on-stack buffers. By enabling
- this option, GCC will be asked to do this for ALL functions.
+ detected and for those versions, this configuration option is
+ ignored. (and a warning is printed during bootup)
source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
- depends on X86_BIOS_REBOOT
- help
+ ---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
but it is independent of the system firmware. And like a reboot
@@ -1312,7 +1379,7 @@ config KEXEC
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
- help
+ ---help---
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
which are loaded in the main kernel with kexec-tools into
@@ -1327,7 +1394,7 @@ config KEXEC_JUMP
bool "kexec jump (EXPERIMENTAL)"
depends on EXPERIMENTAL
depends on KEXEC && HIBERNATION && X86_32
- help
+ ---help---
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
@@ -1336,7 +1403,7 @@ config PHYSICAL_START
default "0x1000000" if X86_NUMAQ
default "0x200000" if X86_64
default "0x100000"
- help
+ ---help---
This gives the physical address where the kernel is loaded.
If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then
@@ -1377,7 +1444,7 @@ config PHYSICAL_START
config RELOCATABLE
bool "Build a relocatable kernel (EXPERIMENTAL)"
depends on EXPERIMENTAL
- help
+ ---help---
This builds a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
The relocations tend to make the kernel binary about 10% larger,
@@ -1397,7 +1464,7 @@ config PHYSICAL_ALIGN
default "0x100000" if X86_32
default "0x200000" if X86_64
range 0x2000 0x400000
- help
+ ---help---
This value puts the alignment restrictions on physical address
where kernel is loaded and run from. Kernel is compiled for an
address which meets above alignment restriction.
@@ -1418,7 +1485,7 @@ config PHYSICAL_ALIGN
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
- depends on SMP && HOTPLUG && !X86_VOYAGER
+ depends on SMP && HOTPLUG
---help---
Say Y here to allow turning CPUs off and on. CPUs can be
controlled through /sys/devices/system/cpu.
@@ -1430,7 +1497,7 @@ config COMPAT_VDSO
def_bool y
prompt "Compat VDSO support"
depends on X86_32 || IA32_EMULATION
- help
+ ---help---
Map the 32-bit VDSO to the predictable old-style address too.
---help---
Say N here if you are running a sufficiently recent glibc
@@ -1442,7 +1509,7 @@ config COMPAT_VDSO
config CMDLINE_BOOL
bool "Built-in kernel command line"
default n
- help
+ ---help---
Allow for specifying boot arguments to the kernel at
build time. On some systems (e.g. embedded ones), it is
necessary or convenient to provide some or all of the
@@ -1460,7 +1527,7 @@ config CMDLINE
string "Built-in kernel command string"
depends on CMDLINE_BOOL
default ""
- help
+ ---help---
Enter arguments here that should be compiled into the kernel
image and used at boot time. If the boot loader provides a
command line at boot time, it is appended to this string to
@@ -1477,7 +1544,7 @@ config CMDLINE_OVERRIDE
bool "Built-in command line overrides boot loader arguments"
default n
depends on CMDLINE_BOOL
- help
+ ---help---
Set this option to 'Y' to have the kernel ignore the boot loader
command line, and use ONLY the built-in command line.
@@ -1490,12 +1557,15 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+ depends on MEMORY_HOTPLUG
+
config HAVE_ARCH_EARLY_PFN_TO_NID
def_bool X86_64
depends on NUMA
menu "Power management and ACPI options"
- depends on !X86_VOYAGER
config ARCH_HIBERNATION_HEADER
def_bool y
@@ -1573,7 +1643,7 @@ if APM
config APM_IGNORE_USER_SUSPEND
bool "Ignore USER SUSPEND"
- help
+ ---help---
This option will ignore USER SUSPEND requests. On machines with a
compliant APM BIOS, you want to say N. However, on the NEC Versa M
series notebooks, it is necessary to say Y because of a BIOS bug.
@@ -1597,7 +1667,7 @@ config APM_DO_ENABLE
config APM_CPU_IDLE
bool "Make CPU Idle calls when idle"
- help
+ ---help---
Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
On some machines, this can activate improved power savings, such as
a slowed CPU clock rate, when the machine is idle. These idle calls
@@ -1608,7 +1678,7 @@ config APM_CPU_IDLE
config APM_DISPLAY_BLANK
bool "Enable console blanking using APM"
- help
+ ---help---
Enable console blanking using the APM. Some laptops can use this to
turn off the LCD backlight when the screen blanker of the Linux
virtual console blanks the screen. Note that this is only used by
@@ -1621,7 +1691,7 @@ config APM_DISPLAY_BLANK
config APM_ALLOW_INTS
bool "Allow interrupts during APM BIOS calls"
- help
+ ---help---
Normally we disable external interrupts while we are making calls to
the APM BIOS as a measure to lessen the effects of a badly behaving
BIOS implementation. The BIOS should reenable interrupts if it
@@ -1629,13 +1699,6 @@ config APM_ALLOW_INTS
many of the newer IBM Thinkpads. If you experience hangs when you
suspend, try setting this to Y. Otherwise, say N.
-config APM_REAL_MODE_POWER_OFF
- bool "Use real mode APM BIOS call to power off"
- help
- Use real mode APM BIOS calls to switch off the computer. This is
- a work-around for a number of buggy BIOSes. Switch this option on if
- your computer crashes instead of powering off properly.
-
endif # APM
source "arch/x86/kernel/cpu/cpufreq/Kconfig"
@@ -1653,7 +1716,7 @@ config PCI
bool "PCI support"
default y
select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
- help
+ ---help---
Find out whether you have a PCI motherboard. PCI is the name of a
bus system, i.e. the way the CPU talks to the other stuff inside
your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
@@ -1724,40 +1787,51 @@ config PCI_MMCONFIG
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
- help
+ ---help---
DMA remapping (DMAR) devices support enables independent address
translations for Direct Memory Access (DMA) from devices.
These DMA remapping devices are reported via ACPI tables
and include PCI device scope covered by these DMA
remapping devices.
+config DMAR_DEFAULT_ON
+ def_bool n
+ prompt "Enable DMA Remapping Devices by default"
+ depends on DMAR
+ help
+ Selecting this option will enable a DMAR device at boot time if
+ one is found. If this option is not selected, DMAR support can
+ be enabled by passing intel_iommu=on to the kernel. It is
+ recommended you say N here while the DMAR code remains
+ experimental.
+
config DMAR_GFX_WA
def_bool y
prompt "Support for Graphics workaround"
depends on DMAR
- help
- Current Graphics drivers tend to use physical address
- for DMA and avoid using DMA APIs. Setting this config
- option permits the IOMMU driver to set a unity map for
- all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA.
+ ---help---
+ Current Graphics drivers tend to use physical address
+ for DMA and avoid using DMA APIs. Setting this config
+ option permits the IOMMU driver to set a unity map for
+ all the OS-visible memory. Hence the driver can continue
+ to use physical addresses for DMA.
config DMAR_FLOPPY_WA
def_bool y
depends on DMAR
- help
- Floppy disk drivers are know to bypass DMA API calls
- thereby failing to work when IOMMU is enabled. This
- workaround will setup a 1:1 mapping for the first
- 16M to make floppy (an ISA device) work.
+ ---help---
+ Floppy disk drivers are known to bypass DMA API calls
+ thereby failing to work when IOMMU is enabled. This
+ workaround will setup a 1:1 mapping for the first
+ 16M to make floppy (an ISA device) work.
config INTR_REMAP
bool "Support for Interrupt Remapping (EXPERIMENTAL)"
depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- help
- Supports Interrupt remapping for IO-APIC and MSI devices.
- To use x2apic mode in the CPU's which support x2APIC enhancements or
- to support platforms with CPU's having > 8 bit APIC ID, say Y.
+ ---help---
+ Supports Interrupt remapping for IO-APIC and MSI devices.
+ To use x2apic mode in the CPU's which support x2APIC enhancements or
+ to support platforms with CPU's having > 8 bit APIC ID, say Y.
source "drivers/pci/pcie/Kconfig"
@@ -1771,8 +1845,7 @@ if X86_32
config ISA
bool "ISA support"
- depends on !X86_VOYAGER
- help
+ ---help---
Find out whether you have ISA slots on your motherboard. ISA is the
name of a bus system, i.e. the way the CPU talks to the other stuff
inside your box. Other bus systems are PCI, EISA, MicroChannel
@@ -1798,9 +1871,8 @@ config EISA
source "drivers/eisa/Kconfig"
config MCA
- bool "MCA support" if !X86_VOYAGER
- default y if X86_VOYAGER
- help
+ bool "MCA support"
+ ---help---
MicroChannel Architecture is found in some IBM PS/2 machines and
laptops. It is a bus system similar to PCI or ISA. See
<file:Documentation/mca.txt> (and especially the web page given
@@ -1810,8 +1882,7 @@ source "drivers/mca/Kconfig"
config SCx200
tristate "NatSemi SCx200 support"
- depends on !X86_VOYAGER
- help
+ ---help---
This provides basic support for National Semiconductor's
(now AMD's) Geode processors. The driver probes for the
PCI-IDs of several on-chip devices, so it's a good dependency
@@ -1823,7 +1894,7 @@ config SCx200HR_TIMER
tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
depends on SCx200 && GENERIC_TIME
default y
- help
+ ---help---
This driver provides a clocksource built upon the on-chip
27MHz high-resolution timer. Its also a workaround for
NSC Geode SC-1100's buggy TSC, which loses time when the
@@ -1834,7 +1905,7 @@ config GEODE_MFGPT_TIMER
def_bool y
prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
- help
+ ---help---
This driver provides a clock event source based on the MFGPT
timer(s) in the CS5535 and CS5536 companion chip for the geode.
MFGPTs have a better resolution and max interval than the
@@ -1843,7 +1914,7 @@ config GEODE_MFGPT_TIMER
config OLPC
bool "One Laptop Per Child support"
default n
- help
+ ---help---
Add support for detecting the unique features of the OLPC
XO hardware.
@@ -1868,16 +1939,16 @@ config IA32_EMULATION
bool "IA32 Emulation"
depends on X86_64
select COMPAT_BINFMT_ELF
- help
+ ---help---
Include code to run 32-bit programs under a 64-bit kernel. You should
likely turn this on, unless you're 100% sure that you don't have any
32-bit programs left.
config IA32_AOUT
- tristate "IA32 a.out support"
- depends on IA32_EMULATION
- help
- Support old a.out binaries in the 32bit emulation.
+ tristate "IA32 a.out support"
+ depends on IA32_EMULATION
+ ---help---
+ Support old a.out binaries in the 32bit emulation.
config COMPAT
def_bool y
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index b815664fe37..a95eaf0e582 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -50,7 +50,7 @@ config M386
config M486
bool "486"
depends on X86_32
- help
+ ---help---
Select this for a 486 series processor, either Intel or one of the
compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX,
DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or
@@ -59,7 +59,7 @@ config M486
config M586
bool "586/K5/5x86/6x86/6x86MX"
depends on X86_32
- help
+ ---help---
Select this for an 586 or 686 series processor such as the AMD K5,
the Cyrix 5x86, 6x86 and 6x86MX. This choice does not
assume the RDTSC (Read Time Stamp Counter) instruction.
@@ -67,21 +67,21 @@ config M586
config M586TSC
bool "Pentium-Classic"
depends on X86_32
- help
+ ---help---
Select this for a Pentium Classic processor with the RDTSC (Read
Time Stamp Counter) instruction for benchmarking.
config M586MMX
bool "Pentium-MMX"
depends on X86_32
- help
+ ---help---
Select this for a Pentium with the MMX graphics/multimedia
extended instructions.
config M686
bool "Pentium-Pro"
depends on X86_32
- help
+ ---help---
Select this for Intel Pentium Pro chips. This enables the use of
Pentium Pro extended instructions, and disables the init-time guard
against the f00f bug found in earlier Pentiums.
@@ -89,7 +89,7 @@ config M686
config MPENTIUMII
bool "Pentium-II/Celeron(pre-Coppermine)"
depends on X86_32
- help
+ ---help---
Select this for Intel chips based on the Pentium-II and
pre-Coppermine Celeron core. This option enables an unaligned
copy optimization, compiles the kernel with optimization flags
@@ -99,7 +99,7 @@ config MPENTIUMII
config MPENTIUMIII
bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon"
depends on X86_32
- help
+ ---help---
Select this for Intel chips based on the Pentium-III and
Celeron-Coppermine core. This option enables use of some
extended prefetch instructions in addition to the Pentium II
@@ -108,14 +108,14 @@ config MPENTIUMIII
config MPENTIUMM
bool "Pentium M"
depends on X86_32
- help
+ ---help---
Select this for Intel Pentium M (not Pentium-4 M)
notebook chips.
config MPENTIUM4
bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
depends on X86_32
- help
+ ---help---
Select this for Intel Pentium 4 chips. This includes the
Pentium 4, Pentium D, P4-based Celeron and Xeon, and
Pentium-4 M (not Pentium M) chips. This option enables compile
@@ -151,7 +151,7 @@ config MPENTIUM4
config MK6
bool "K6/K6-II/K6-III"
depends on X86_32
- help
+ ---help---
Select this for an AMD K6-family processor. Enables use of
some extended instructions, and passes appropriate optimization
flags to GCC.
@@ -159,22 +159,22 @@ config MK6
config MK7
bool "Athlon/Duron/K7"
depends on X86_32
- help
+ ---help---
Select this for an AMD Athlon K7-family processor. Enables use of
some extended instructions, and passes appropriate optimization
flags to GCC.
config MK8
bool "Opteron/Athlon64/Hammer/K8"
- help
- Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables
- use of some extended instructions, and passes appropriate optimization
- flags to GCC.
+ ---help---
+ Select this for an AMD Opteron or Athlon64 Hammer-family processor.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
config MCRUSOE
bool "Crusoe"
depends on X86_32
- help
+ ---help---
Select this for a Transmeta Crusoe processor. Treats the processor
like a 586 with TSC, and sets some GCC optimization flags (like a
Pentium Pro with no alignment requirements).
@@ -182,13 +182,13 @@ config MCRUSOE
config MEFFICEON
bool "Efficeon"
depends on X86_32
- help
+ ---help---
Select this for a Transmeta Efficeon processor.
config MWINCHIPC6
bool "Winchip-C6"
depends on X86_32
- help
+ ---help---
Select this for an IDT Winchip C6 chip. Linux and GCC
treat this chip as a 586TSC with some extended instructions
and alignment requirements.
@@ -196,7 +196,7 @@ config MWINCHIPC6
config MWINCHIP3D
bool "Winchip-2/Winchip-2A/Winchip-3"
depends on X86_32
- help
+ ---help---
Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
treat this chip as a 586TSC with some extended instructions
and alignment requirements. Also enable out of order memory
@@ -206,19 +206,19 @@ config MWINCHIP3D
config MGEODEGX1
bool "GeodeGX1"
depends on X86_32
- help
+ ---help---
Select this for a Geode GX1 (Cyrix MediaGX) chip.
config MGEODE_LX
bool "Geode GX/LX"
depends on X86_32
- help
+ ---help---
Select this for AMD Geode GX and LX processors.
config MCYRIXIII
bool "CyrixIII/VIA-C3"
depends on X86_32
- help
+ ---help---
Select this for a Cyrix III or C3 chip. Presently Linux and GCC
treat this chip as a generic 586. Whilst the CPU is 686 class,
it lacks the cmov extension which gcc assumes is present when
@@ -230,7 +230,7 @@ config MCYRIXIII
config MVIAC3_2
bool "VIA C3-2 (Nehemiah)"
depends on X86_32
- help
+ ---help---
Select this for a VIA C3 "Nehemiah". Selecting this enables usage
of SSE and tells gcc to treat the CPU as a 686.
Note, this kernel will not boot on older (pre model 9) C3s.
@@ -238,14 +238,14 @@ config MVIAC3_2
config MVIAC7
bool "VIA C7"
depends on X86_32
- help
+ ---help---
Select this for a VIA C7. Selecting this uses the correct cache
shift and tells gcc to treat the CPU as a 686.
config MPSC
bool "Intel P4 / older Netburst based Xeon"
depends on X86_64
- help
+ ---help---
Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
Xeon CPUs with Intel 64bit which is compatible with x86-64.
Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
@@ -255,15 +255,17 @@ config MPSC
config MCORE2
bool "Core 2/newer Xeon"
- help
- Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx)
- CPUs. You can distinguish newer from older Xeons by the CPU family
- in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo)
+ ---help---
+
+ Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
+ 53xx) CPUs. You can distinguish newer from older Xeons by the CPU
+ family in /proc/cpuinfo. Newer ones have 6 and older ones 15
+ (not a typo)
config GENERIC_CPU
bool "Generic-x86-64"
depends on X86_64
- help
+ ---help---
Generic x86-64 CPU.
Run equally well on all x86-64 CPUs.
@@ -272,7 +274,7 @@ endchoice
config X86_GENERIC
bool "Generic x86 support"
depends on X86_32
- help
+ ---help---
Instead of just including optimizations for the selected
x86 variant (e.g. PII, Crusoe or Athlon), include some more
generic optimizations as well. This will make the kernel
@@ -292,25 +294,23 @@ config X86_CPU
# Define implied options from the CPU selection here
config X86_L1_CACHE_BYTES
int
- default "128" if GENERIC_CPU || MPSC
- default "64" if MK8 || MCORE2
- depends on X86_64
+ default "128" if MPSC
+ default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32
config X86_INTERNODE_CACHE_BYTES
int
default "4096" if X86_VSMP
default X86_L1_CACHE_BYTES if !X86_VSMP
- depends on X86_64
config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
config X86_L1_CACHE_SHIFT
int
- default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
+ default "7" if MPENTIUM4 || MPSC
default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU
config X86_XADD
def_bool y
@@ -319,15 +319,15 @@ config X86_XADD
config X86_PPRO_FENCE
bool "PentiumPro memory ordering errata workaround"
depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
- help
- Old PentiumPro multiprocessor systems had errata that could cause memory
- operations to violate the x86 ordering standard in rare cases. Enabling this
- option will attempt to work around some (but not all) occurances of
- this problem, at the cost of much heavier spinlock and memory barrier
- operations.
+ ---help---
+ Old PentiumPro multiprocessor systems had errata that could cause
+ memory operations to violate the x86 ordering standard in rare cases.
+ Enabling this option will attempt to work around some (but not all)
+ occurrences of this problem, at the cost of much heavier spinlock and
+ memory barrier operations.
- If unsure, say n here. Even distro kernels should think twice before enabling
- this: there are few systems, and an unlikely bug.
+ If unsure, say n here. Even distro kernels should think twice before
+ enabling this: there are few systems, and an unlikely bug.
config X86_F00F_BUG
def_bool y
@@ -408,18 +408,18 @@ config X86_MINIMUM_CPU_FAMILY
config X86_DEBUGCTLMSR
def_bool y
- depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
menuconfig PROCESSOR_SELECT
bool "Supported processor vendors" if EMBEDDED
- help
+ ---help---
This lets you choose what x86 vendor support code your kernel
will include.
config CPU_SUP_INTEL
default y
bool "Support Intel processors" if PROCESSOR_SELECT
- help
+ ---help---
This enables detection, tunings and quirks for Intel processors
You need this enabled if you want your kernel to run on an
@@ -433,7 +433,7 @@ config CPU_SUP_CYRIX_32
default y
bool "Support Cyrix processors" if PROCESSOR_SELECT
depends on !64BIT
- help
+ ---help---
This enables detection, tunings and quirks for Cyrix processors
You need this enabled if you want your kernel to run on a
@@ -446,7 +446,7 @@ config CPU_SUP_CYRIX_32
config CPU_SUP_AMD
default y
bool "Support AMD processors" if PROCESSOR_SELECT
- help
+ ---help---
This enables detection, tunings and quirks for AMD processors
You need this enabled if you want your kernel to run on an
@@ -460,7 +460,7 @@ config CPU_SUP_CENTAUR_32
default y
bool "Support Centaur processors" if PROCESSOR_SELECT
depends on !64BIT
- help
+ ---help---
This enables detection, tunings and quirks for Centaur processors
You need this enabled if you want your kernel to run on a
@@ -474,7 +474,7 @@ config CPU_SUP_CENTAUR_64
default y
bool "Support Centaur processors" if PROCESSOR_SELECT
depends on 64BIT
- help
+ ---help---
This enables detection, tunings and quirks for Centaur processors
You need this enabled if you want your kernel to run on a
@@ -488,7 +488,7 @@ config CPU_SUP_TRANSMETA_32
default y
bool "Support Transmeta processors" if PROCESSOR_SELECT
depends on !64BIT
- help
+ ---help---
This enables detection, tunings and quirks for Transmeta processors
You need this enabled if you want your kernel to run on a
@@ -502,7 +502,7 @@ config CPU_SUP_UMC_32
default y
bool "Support UMC processors" if PROCESSOR_SELECT
depends on !64BIT
- help
+ ---help---
This enables detection, tunings and quirks for UMC processors
You need this enabled if you want your kernel to run on a
@@ -515,12 +515,13 @@ config CPU_SUP_UMC_32
config X86_DS
def_bool X86_PTRACE_BTS
depends on X86_DEBUGCTLMSR
+ select HAVE_HW_BRANCH_TRACER
config X86_PTRACE_BTS
bool "Branch Trace Store"
default y
depends on X86_DEBUGCTLMSR
- help
+ ---help---
This adds a ptrace interface to the hardware's branch trace store.
Debuggers may use it to collect an execution trace of the debugged
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2a3dfbd5e67..ba4781b9389 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -7,7 +7,7 @@ source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
- help
+ ---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
@@ -25,7 +25,7 @@ config STRICT_DEVMEM
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
- help
+ ---help---
Enables the informational output from the decompression stage
(e.g. bzImage) of the boot. If you disable this you will still
see errors. Disable this if you want silent bootup.
@@ -33,7 +33,7 @@ config X86_VERBOSE_BOOTUP
config EARLY_PRINTK
bool "Early printk" if EMBEDDED
default y
- help
+ ---help---
Write kernel log output directly into the VGA buffer or to a serial
port.
@@ -47,7 +47,7 @@ config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
default n
depends on EARLY_PRINTK && PCI
- help
+ ---help---
Write kernel log output directly into the EHCI debug port.
This is useful for kernel debugging when your machine crashes very
@@ -59,14 +59,14 @@ config EARLY_PRINTK_DBGP
config DEBUG_STACKOVERFLOW
bool "Check for stack overflows"
depends on DEBUG_KERNEL
- help
+ ---help---
This option will cause messages to be printed if free stack space
drops below a certain limit.
config DEBUG_STACK_USAGE
bool "Stack utilization instrumentation"
depends on DEBUG_KERNEL
- help
+ ---help---
Enables the display of the minimum amount of free stack which each
task has ever had available in the sysrq-T and sysrq-P debug output.
@@ -75,7 +75,7 @@ config DEBUG_STACK_USAGE
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
- help
+ ---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
of memory corruptions.
@@ -83,9 +83,9 @@ config DEBUG_PAGEALLOC
config DEBUG_PER_CPU_MAPS
bool "Debug access to per_cpu maps"
depends on DEBUG_KERNEL
- depends on X86_SMP
+ depends on SMP
default n
- help
+ ---help---
Say Y to verify that the per_cpu map being accessed has
been setup. Adds a fair amount of code to kernel memory
and decreases performance.
@@ -96,7 +96,7 @@ config X86_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
select DEBUG_FS
- help
+ ---help---
Say Y here if you want to show the kernel pagetable layout in a
debugfs file. This information is only useful for kernel developers
who are working in architecture specific areas of the kernel.
@@ -108,28 +108,17 @@ config DEBUG_RODATA
bool "Write protect kernel read-only data structures"
default y
depends on DEBUG_KERNEL
- help
+ ---help---
Mark the kernel read-only data as write-protected in the pagetables,
in order to catch accidental (and incorrect) writes to such const
data. This is recommended so that we can catch kernel bugs sooner.
If in doubt, say "Y".
-config DIRECT_GBPAGES
- bool "Enable gbpages-mapped kernel pagetables"
- depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
- help
- Enable gigabyte pages support (if the CPU supports it). This can
- improve the kernel's performance a tiny bit by reducing TLB
- pressure.
-
- This is experimental code.
-
- If in doubt, say "N".
-
config DEBUG_RODATA_TEST
bool "Testcase for the DEBUG_RODATA feature"
depends on DEBUG_RODATA
- help
+ default y
+ ---help---
This option enables a testcase for the DEBUG_RODATA
feature as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
@@ -137,7 +126,7 @@ config DEBUG_RODATA_TEST
config DEBUG_NX_TEST
tristate "Testcase for the NX non-executable stack feature"
depends on DEBUG_KERNEL && m
- help
+ ---help---
This option enables a testcase for the CPU NX capability
and the software setup of this feature.
If in doubt, say "N"
@@ -145,7 +134,7 @@ config DEBUG_NX_TEST
config 4KSTACKS
bool "Use 4Kb for kernel stacks instead of 8Kb"
depends on X86_32
- help
+ ---help---
If you say Y here the kernel will use a 4Kb stacksize for the
kernel stack attached to each process/thread. This facilitates
running more threads on a system and also reduces the pressure
@@ -156,7 +145,7 @@ config DOUBLEFAULT
default y
bool "Enable doublefault exception handler" if EMBEDDED
depends on X86_32
- help
+ ---help---
This option allows trapping of rare doublefault exceptions that
would otherwise cause a system to silently reboot. Disabling this
option saves about 4k and might cause you much additional grey
@@ -166,7 +155,7 @@ config IOMMU_DEBUG
bool "Enable IOMMU debugging"
depends on GART_IOMMU && DEBUG_KERNEL
depends on X86_64
- help
+ ---help---
Force the IOMMU to on even when you have less than 4GB of
memory and add debugging code. On overflow always panic. And
allow to enable IOMMU leak tracing. Can be disabled at boot
@@ -182,18 +171,14 @@ config IOMMU_LEAK
bool "IOMMU leak tracing"
depends on DEBUG_KERNEL
depends on IOMMU_DEBUG
- help
+ ---help---
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
-config MMIOTRACE_HOOKS
- bool
-
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on DEBUG_KERNEL && PCI
select TRACING
- select MMIOTRACE_HOOKS
help
Mmiotrace traces Memory Mapped I/O access and is meant for
debugging and reverse engineering. It is called from the ioremap
@@ -239,25 +224,25 @@ choice
config IO_DELAY_0X80
bool "port 0x80 based port-IO delay [recommended]"
- help
+ ---help---
This is the traditional Linux IO delay used for in/out_p.
It is the most tested hence safest selection here.
config IO_DELAY_0XED
bool "port 0xed based port-IO delay"
- help
+ ---help---
Use port 0xed as the IO delay. This frees up port 0x80 which is
often used as a hardware-debug port.
config IO_DELAY_UDELAY
bool "udelay based port-IO delay"
- help
+ ---help---
Use udelay(2) as the IO delay method. This provides the delay
while not having any side-effect on the IO port space.
config IO_DELAY_NONE
bool "no port-IO delay"
- help
+ ---help---
No port-IO delay. Will break on old boxes that require port-IO
delay for certain operations. Should work on most new machines.
@@ -291,28 +276,27 @@ config DEBUG_BOOT_PARAMS
bool "Debug boot parameters"
depends on DEBUG_KERNEL
depends on DEBUG_FS
- help
+ ---help---
This option will cause struct boot_params to be exported via debugfs.
config CPA_DEBUG
bool "CPA self-test code"
depends on DEBUG_KERNEL
- help
+ ---help---
Do change_page_attr() self-tests every 30 seconds.
config OPTIMIZE_INLINING
bool "Allow gcc to uninline functions marked 'inline'"
- help
+ ---help---
This option determines if the kernel forces gcc to inline the functions
developers have marked 'inline'. Doing so takes away freedom from gcc to
do what it thinks is best, which is desirable for the gcc 3.x series of
compilers. The gcc 4.x series have a rewritten inlining algorithm and
- disabling this option will generate a smaller kernel there. Hopefully
- this algorithm is so good that allowing gcc4 to make the decision can
- become the default in the future, until then this option is there to
- test gcc for this.
+ enabling this option will generate a smaller kernel there. Hopefully
+ this algorithm is so good that allowing gcc 4.x and above to make the
+ decision will become the default in the future. Until then this option
+ is there to test gcc for this.
If unsure, say N.
endmenu
-
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47adb5ae..1836191839e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,14 +70,17 @@ else
# this works around some issues with generating unwind tables in older gccs
# newer gccs do it by default
KBUILD_CFLAGS += -maccumulate-outgoing-args
+endif
- stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
- stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
- "$(CC)" -fstack-protector )
- stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
- "$(CC)" -fstack-protector-all )
-
- KBUILD_CFLAGS += $(stackp-y)
+ifdef CONFIG_CC_STACKPROTECTOR
+ cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
+ ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y)
+ stackp-y := -fstack-protector
+ stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
+ KBUILD_CFLAGS += $(stackp-y)
+ else
+ $(warning stack protector enabled but no compiler support)
+ endif
endif
# Stackpointer is addressed different for 32 bit and 64 bit x86
@@ -102,29 +105,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# prevent gcc from generating any FP code by mistake
KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
-###
-# Sub architecture support
-# fcore-y is linked before mcore-y files.
-
-# Default subarch .c files
-mcore-y := arch/x86/mach-default/
-
-# Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager
-mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
-
-# generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic
-fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
-mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
-
-# default subarch .h files
-mflags-y += -Iarch/x86/include/asm/mach-default
-
-# 64 bit does not support subarch support - clear sub arch variables
-fcore-$(CONFIG_X86_64) :=
-mcore-$(CONFIG_X86_64) :=
-
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
@@ -150,9 +130,6 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
core-y += arch/x86/kernel/
core-y += arch/x86/mm/
-# Remaining sub architecture files
-core-y += $(mcore-y)
-
core-y += arch/x86/crypto/
core-y += arch/x86/vdso/
core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 0be77b39328..7e8e8b25f5f 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -74,7 +74,7 @@ static int kbd_pending(void)
{
u8 pending;
asm volatile("int $0x16; setnz %0"
- : "=rm" (pending)
+ : "=qm" (pending)
: "a" (0x0100));
return pending;
}
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 75115849af3..4a58c8ce3f6 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -269,9 +269,8 @@ void vesa_store_edid(void)
we genuinely have to assume all registers are destroyed here. */
asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
- : "+a" (ax), "+b" (bx)
- : "c" (cx), "D" (di)
- : "esi");
+ : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
+ : : "esi", "edx");
if (ax != 0x004f)
return; /* No EDID */
@@ -285,9 +284,9 @@ void vesa_store_edid(void)
dx = 0; /* EDID block number */
di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
asm(INT10
- : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info)
- : "c" (cx), "D" (di)
- : "esi");
+ : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
+ "+c" (cx), "+D" (di)
+ : : "esi");
#endif /* CONFIG_FIRMWARE_EDID */
}
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index b939cb476de..5d4742ed4aa 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = {
{ VIDEO_80x25, 80, 25, 0 },
};
-__videocard video_vga;
+static __videocard video_vga;
/* Set basic 80x25 mode */
static u8 vga_set_basic_mode(void)
@@ -259,7 +259,7 @@ static int vga_probe(void)
return mode_count[adapter];
}
-__videocard video_vga = {
+static __videocard video_vga = {
.card_name = "VGA",
.probe = vga_probe,
.set_mode = vga_set_mode,
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 83598b23093..3bef2c1febe 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -226,7 +226,7 @@ static unsigned int mode_menu(void)
#ifdef CONFIG_VIDEO_RETAIN
/* Save screen content to the heap */
-struct saved_screen {
+static struct saved_screen {
int x, y;
int curx, cury;
u16 *data;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 13b8c86ae98..096dd5359cd 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,14 +1,13 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc5
-# Wed Sep 3 17:23:09 2008
+# Linux kernel version: 2.6.29-rc4
+# Thu Feb 12 12:57:57 2009
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
# CONFIG_X86_64 is not set
CONFIG_X86=y
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
-# CONFIG_GENERIC_LOCKBREAK is not set
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_CLOCKSOURCE_WATCHDOG=y
@@ -24,16 +23,14 @@ CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_GENERIC_BUG=y
CONFIG_GENERIC_HWEIGHT=y
-# CONFIG_GENERIC_GPIO is not set
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
-# CONFIG_ARCH_HAS_ILOG2_U32 is not set
-# CONFIG_ARCH_HAS_ILOG2_U64 is not set
CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
# CONFIG_GENERIC_TIME_VSYSCALL is not set
CONFIG_ARCH_HAS_CPU_RELAX=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
@@ -42,12 +39,12 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
# CONFIG_ZONE_DMA32 is not set
CONFIG_ARCH_POPULATES_NODE_MAP=y
# CONFIG_AUDIT_ARCH is not set
-CONFIG_ARCH_SUPPORTS_AOUT=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_X86_SMP=y
+CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_X86_32_SMP=y
CONFIG_X86_HT=y
CONFIG_X86_BIOS_REBOOT=y
@@ -76,30 +73,44 @@ CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_TREE=y
+
+#
+# RCU Subsystem
+#
+# CONFIG_CLASSIC_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
-CONFIG_LOG_BUF_SHIFT=17
-CONFIG_CGROUPS=y
-# CONFIG_CGROUP_DEBUG is not set
-CONFIG_CGROUP_NS=y
-# CONFIG_CGROUP_DEVICE is not set
-CONFIG_CPUSETS=y
+CONFIG_LOG_BUF_SHIFT=18
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
CONFIG_GROUP_SCHED=y
CONFIG_FAIR_GROUP_SCHED=y
# CONFIG_RT_GROUP_SCHED is not set
# CONFIG_USER_SCHED is not set
CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUPS=y
+# CONFIG_CGROUP_DEBUG is not set
+CONFIG_CGROUP_NS=y
+CONFIG_CGROUP_FREEZER=y
+# CONFIG_CGROUP_DEVICE is not set
+CONFIG_CPUSETS=y
+CONFIG_PROC_PID_CPUSET=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_RESOURCE_COUNTERS=y
# CONFIG_CGROUP_MEM_RES_CTLR is not set
# CONFIG_SYSFS_DEPRECATED_V2 is not set
-CONFIG_PROC_PID_CPUSET=y
CONFIG_RELAY=y
CONFIG_NAMESPACES=y
CONFIG_UTS_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
CONFIG_PID_NS=y
+CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -124,12 +135,15 @@ CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
+CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_PCI_QUIRKS=y
CONFIG_SLUB_DEBUG=y
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
CONFIG_MARKERS=y
# CONFIG_OPROFILE is not set
CONFIG_HAVE_OPROFILE=y
@@ -139,15 +153,10 @@ CONFIG_KRETPROBES=y
CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
-# CONFIG_HAVE_ARCH_TRACEHOOK is not set
-# CONFIG_HAVE_DMA_ATTRS is not set
-CONFIG_USE_GENERIC_SMP_HELPERS=y
-# CONFIG_HAVE_CLK is not set
-CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
CONFIG_HAVE_GENERIC_DMA_COHERENT=y
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
-# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
# CONFIG_MODULE_FORCE_LOAD is not set
@@ -155,12 +164,10 @@ CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_KMOD=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
# CONFIG_LBD is not set
CONFIG_BLK_DEV_IO_TRACE=y
-# CONFIG_LSF is not set
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set
@@ -176,7 +183,7 @@ CONFIG_IOSCHED_CFQ=y
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"
-CONFIG_CLASSIC_RCU=y
+CONFIG_FREEZER=y
#
# Processor type and features
@@ -186,15 +193,15 @@ CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
+CONFIG_SPARSE_IRQ=y
CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
-CONFIG_X86_PC=y
# CONFIG_X86_ELAN is not set
# CONFIG_X86_VOYAGER is not set
# CONFIG_X86_GENERICARCH is not set
# CONFIG_X86_VSMP is not set
# CONFIG_X86_RDC321X is not set
-CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
+CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
# CONFIG_M386 is not set
@@ -238,10 +245,19 @@ CONFIG_X86_TSC=y
CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=4
CONFIG_X86_DEBUGCTLMSR=y
+CONFIG_CPU_SUP_INTEL=y
+CONFIG_CPU_SUP_CYRIX_32=y
+CONFIG_CPU_SUP_AMD=y
+CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_TRANSMETA_32=y
+CONFIG_CPU_SUP_UMC_32=y
+CONFIG_X86_DS=y
+CONFIG_X86_PTRACE_BTS=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
+# CONFIG_IOMMU_API is not set
CONFIG_NR_CPUS=64
CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
@@ -250,12 +266,15 @@ CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
+CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
# CONFIG_X86_MCE is not set
CONFIG_VM86=y
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
CONFIG_X86_REBOOTFIXUPS=y
CONFIG_MICROCODE=y
+CONFIG_MICROCODE_INTEL=y
+CONFIG_MICROCODE_AMD=y
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
@@ -264,6 +283,7 @@ CONFIG_HIGHMEM4G=y
# CONFIG_HIGHMEM64G is not set
CONFIG_PAGE_OFFSET=0xC0000000
CONFIG_HIGHMEM=y
+# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
CONFIG_ARCH_FLATMEM_ENABLE=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_ARCH_SELECT_MEMORY_MODEL=y
@@ -274,14 +294,17 @@ CONFIG_FLATMEM_MANUAL=y
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_SPARSEMEM_STATIC=y
-# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
-CONFIG_RESOURCES_64BIT=y
+# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
+CONFIG_UNEVICTABLE_LRU=y
CONFIG_HIGHPTE=y
+CONFIG_X86_CHECK_BIOS_CORRUPTION=y
+CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
+CONFIG_X86_RESERVE_LOW_64K=y
# CONFIG_MATH_EMULATION is not set
CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
@@ -298,14 +321,15 @@ CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
+# CONFIG_RELOCATABLE is not set
CONFIG_PHYSICAL_ALIGN=0x200000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
+# CONFIG_CMDLINE_BOOL is not set
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
#
-# Power management options
+# Power management and ACPI options
#
CONFIG_PM=y
CONFIG_PM_DEBUG=y
@@ -331,19 +355,13 @@ CONFIG_ACPI_BATTERY=y
CONFIG_ACPI_BUTTON=y
CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
-# CONFIG_ACPI_BAY is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
-# CONFIG_ACPI_WMI is not set
-# CONFIG_ACPI_ASUS is not set
-# CONFIG_ACPI_TOSHIBA is not set
# CONFIG_ACPI_CUSTOM_DSDT is not set
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
-CONFIG_ACPI_EC=y
# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_POWER=y
CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
@@ -388,7 +406,6 @@ CONFIG_X86_ACPI_CPUFREQ=y
#
# shared options
#
-# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
# CONFIG_X86_SPEEDSTEP_LIB is not set
CONFIG_CPU_IDLE=y
CONFIG_CPU_IDLE_GOV_LADDER=y
@@ -415,6 +432,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
# CONFIG_PCI_LEGACY is not set
# CONFIG_PCI_DEBUG is not set
+# CONFIG_PCI_STUB is not set
CONFIG_HT_IRQ=y
CONFIG_ISA_DMA_API=y
# CONFIG_ISA is not set
@@ -452,13 +470,17 @@ CONFIG_HOTPLUG_PCI=y
# Executable file formats / Emulations
#
CONFIG_BINFMT_ELF=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
+CONFIG_HAVE_AOUT=y
# CONFIG_BINFMT_AOUT is not set
CONFIG_BINFMT_MISC=y
+CONFIG_HAVE_ATOMIC_IOMAP=y
CONFIG_NET=y
#
# Networking options
#
+CONFIG_COMPAT_NET_DEV_OPS=y
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
@@ -519,7 +541,6 @@ CONFIG_DEFAULT_CUBIC=y
# CONFIG_DEFAULT_RENO is not set
CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_TCP_MD5SIG=y
-# CONFIG_IP_VS is not set
CONFIG_IPV6=y
# CONFIG_IPV6_PRIVACY is not set
# CONFIG_IPV6_ROUTER_PREF is not set
@@ -557,19 +578,21 @@ CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CT_NETLINK=y
CONFIG_NETFILTER_XTABLES=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
+# CONFIG_IP_VS is not set
#
# IP: Netfilter Configuration
#
+CONFIG_NF_DEFRAG_IPV4=y
CONFIG_NF_CONNTRACK_IPV4=y
CONFIG_NF_CONNTRACK_PROC_COMPAT=y
CONFIG_IP_NF_IPTABLES=y
@@ -595,8 +618,8 @@ CONFIG_IP_NF_MANGLE=y
CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_IPV6HEADER=y
-CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_LOG=y
+CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
# CONFIG_IP_DCCP is not set
@@ -604,6 +627,7 @@ CONFIG_IP6_NF_MANGLE=y
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
# CONFIG_BRIDGE is not set
+# CONFIG_NET_DSA is not set
# CONFIG_VLAN_8021Q is not set
# CONFIG_DECNET is not set
CONFIG_LLC=y
@@ -623,6 +647,7 @@ CONFIG_NET_SCHED=y
# CONFIG_NET_SCH_HTB is not set
# CONFIG_NET_SCH_HFSC is not set
# CONFIG_NET_SCH_PRIO is not set
+# CONFIG_NET_SCH_MULTIQ is not set
# CONFIG_NET_SCH_RED is not set
# CONFIG_NET_SCH_SFQ is not set
# CONFIG_NET_SCH_TEQL is not set
@@ -630,6 +655,7 @@ CONFIG_NET_SCHED=y
# CONFIG_NET_SCH_GRED is not set
# CONFIG_NET_SCH_DSMARK is not set
# CONFIG_NET_SCH_NETEM is not set
+# CONFIG_NET_SCH_DRR is not set
# CONFIG_NET_SCH_INGRESS is not set
#
@@ -644,6 +670,7 @@ CONFIG_NET_CLS=y
# CONFIG_NET_CLS_RSVP is not set
# CONFIG_NET_CLS_RSVP6 is not set
# CONFIG_NET_CLS_FLOW is not set
+# CONFIG_NET_CLS_CGROUP is not set
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_STACK=32
# CONFIG_NET_EMATCH_CMP is not set
@@ -659,7 +686,9 @@ CONFIG_NET_CLS_ACT=y
# CONFIG_NET_ACT_NAT is not set
# CONFIG_NET_ACT_PEDIT is not set
# CONFIG_NET_ACT_SIMP is not set
+# CONFIG_NET_ACT_SKBEDIT is not set
CONFIG_NET_SCH_FIFO=y
+# CONFIG_DCB is not set
#
# Network testing
@@ -676,29 +705,33 @@ CONFIG_HAMRADIO=y
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
+# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
-
-#
-# Wireless
-#
+CONFIG_WIRELESS=y
CONFIG_CFG80211=y
+# CONFIG_CFG80211_REG_DEBUG is not set
CONFIG_NL80211=y
+CONFIG_WIRELESS_OLD_REGULATORY=y
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
+# CONFIG_LIB80211 is not set
CONFIG_MAC80211=y
#
# Rate control algorithm selection
#
-CONFIG_MAC80211_RC_PID=y
-CONFIG_MAC80211_RC_DEFAULT_PID=y
-CONFIG_MAC80211_RC_DEFAULT="pid"
+CONFIG_MAC80211_RC_MINSTREL=y
+# CONFIG_MAC80211_RC_DEFAULT_PID is not set
+CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
+CONFIG_MAC80211_RC_DEFAULT="minstrel"
# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
# CONFIG_MAC80211_DEBUGFS is not set
# CONFIG_MAC80211_DEBUG_MENU is not set
-# CONFIG_IEEE80211 is not set
-# CONFIG_RFKILL is not set
+# CONFIG_WIMAX is not set
+CONFIG_RFKILL=y
+# CONFIG_RFKILL_INPUT is not set
+CONFIG_RFKILL_LEDS=y
# CONFIG_NET_9P is not set
#
@@ -722,7 +755,7 @@ CONFIG_PROC_EVENTS=y
# CONFIG_MTD is not set
# CONFIG_PARPORT is not set
CONFIG_PNP=y
-# CONFIG_PNP_DEBUG is not set
+CONFIG_PNP_DEBUG_MESSAGES=y
#
# Protocols
@@ -750,20 +783,19 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
CONFIG_MISC_DEVICES=y
# CONFIG_IBM_ASM is not set
# CONFIG_PHANTOM is not set
-# CONFIG_EEPROM_93CX6 is not set
# CONFIG_SGI_IOC4 is not set
# CONFIG_TIFM_CORE is not set
-# CONFIG_ACER_WMI is not set
-# CONFIG_ASUS_LAPTOP is not set
-# CONFIG_FUJITSU_LAPTOP is not set
-# CONFIG_TC1100_WMI is not set
-# CONFIG_MSI_LAPTOP is not set
-# CONFIG_COMPAL_LAPTOP is not set
-# CONFIG_SONY_LAPTOP is not set
-# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_INTEL_MENLOW is not set
+# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_HP_ILO is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_93CX6 is not set
CONFIG_HAVE_IDE=y
# CONFIG_IDE is not set
@@ -802,7 +834,7 @@ CONFIG_SCSI_WAIT_SCAN=m
#
CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_FC_ATTRS is not set
-CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_ISCSI_ATTRS is not set
# CONFIG_SCSI_SAS_ATTRS is not set
# CONFIG_SCSI_SAS_LIBSAS is not set
# CONFIG_SCSI_SRP_ATTRS is not set
@@ -875,6 +907,7 @@ CONFIG_PATA_OLDPIIX=y
CONFIG_PATA_SCH=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
+CONFIG_MD_AUTODETECT=y
# CONFIG_MD_LINEAR is not set
# CONFIG_MD_RAID0 is not set
# CONFIG_MD_RAID1 is not set
@@ -930,6 +963,9 @@ CONFIG_PHYLIB=y
# CONFIG_BROADCOM_PHY is not set
# CONFIG_ICPLUS_PHY is not set
# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
# CONFIG_FIXED_PHY is not set
# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
@@ -953,6 +989,9 @@ CONFIG_NET_TULIP=y
# CONFIG_IBM_NEW_EMAC_RGMII is not set
# CONFIG_IBM_NEW_EMAC_TAH is not set
# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
+# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
+# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
+# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
# CONFIG_PCNET32 is not set
# CONFIG_AMD8111_ETH is not set
@@ -960,7 +999,6 @@ CONFIG_NET_PCI=y
# CONFIG_B44 is not set
CONFIG_FORCEDETH=y
# CONFIG_FORCEDETH_NAPI is not set
-# CONFIG_EEPRO100 is not set
CONFIG_E100=y
# CONFIG_FEALNX is not set
# CONFIG_NATSEMI is not set
@@ -974,15 +1012,16 @@ CONFIG_8139TOO=y
# CONFIG_R6040 is not set
# CONFIG_SIS900 is not set
# CONFIG_EPIC100 is not set
+# CONFIG_SMSC9420 is not set
# CONFIG_SUNDANCE is not set
# CONFIG_TLAN is not set
# CONFIG_VIA_RHINE is not set
# CONFIG_SC92031 is not set
+# CONFIG_ATL2 is not set
CONFIG_NETDEV_1000=y
# CONFIG_ACENIC is not set
# CONFIG_DL2K is not set
CONFIG_E1000=y
-# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
CONFIG_E1000E=y
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
@@ -1000,18 +1039,23 @@ CONFIG_BNX2=y
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
+# CONFIG_JME is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
+CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_CHELSIO_T3 is not set
+# CONFIG_ENIC is not set
# CONFIG_IXGBE is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_NIU is not set
+# CONFIG_MLX4_EN is not set
# CONFIG_MLX4_CORE is not set
# CONFIG_TEHUTI is not set
# CONFIG_BNX2X is not set
+# CONFIG_QLGE is not set
# CONFIG_SFC is not set
CONFIG_TR=y
# CONFIG_IBMOL is not set
@@ -1025,9 +1069,8 @@ CONFIG_TR=y
# CONFIG_WLAN_PRE80211 is not set
CONFIG_WLAN_80211=y
# CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
# CONFIG_LIBERTAS is not set
+# CONFIG_LIBERTAS_THINFIRM is not set
# CONFIG_AIRO is not set
# CONFIG_HERMES is not set
# CONFIG_ATMEL is not set
@@ -1044,6 +1087,8 @@ CONFIG_WLAN_80211=y
CONFIG_ATH5K=y
# CONFIG_ATH5K_DEBUG is not set
# CONFIG_ATH9K is not set
+# CONFIG_IPW2100 is not set
+# CONFIG_IPW2200 is not set
# CONFIG_IWLCORE is not set
# CONFIG_IWLWIFI_LEDS is not set
# CONFIG_IWLAGN is not set
@@ -1055,6 +1100,10 @@ CONFIG_ATH5K=y
# CONFIG_RT2X00 is not set
#
+# Enable WiMAX (Networking options) to see the WiMAX drivers
+#
+
+#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
@@ -1062,6 +1111,7 @@ CONFIG_ATH5K=y
# CONFIG_USB_PEGASUS is not set
# CONFIG_USB_RTL8150 is not set
# CONFIG_USB_USBNET is not set
+# CONFIG_USB_HSO is not set
CONFIG_NET_PCMCIA=y
# CONFIG_PCMCIA_3C589 is not set
# CONFIG_PCMCIA_3C574 is not set
@@ -1123,6 +1173,7 @@ CONFIG_MOUSE_PS2_LOGIPS2PP=y
CONFIG_MOUSE_PS2_SYNAPTICS=y
CONFIG_MOUSE_PS2_LIFEBOOK=y
CONFIG_MOUSE_PS2_TRACKPOINT=y
+# CONFIG_MOUSE_PS2_ELANTECH is not set
# CONFIG_MOUSE_PS2_TOUCHKIT is not set
# CONFIG_MOUSE_SERIAL is not set
# CONFIG_MOUSE_APPLETOUCH is not set
@@ -1160,15 +1211,16 @@ CONFIG_INPUT_TOUCHSCREEN=y
# CONFIG_TOUCHSCREEN_FUJITSU is not set
# CONFIG_TOUCHSCREEN_GUNZE is not set
# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
# CONFIG_TOUCHSCREEN_MTOUCH is not set
# CONFIG_TOUCHSCREEN_INEXIO is not set
# CONFIG_TOUCHSCREEN_MK712 is not set
# CONFIG_TOUCHSCREEN_PENMOUNT is not set
# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
-# CONFIG_TOUCHSCREEN_UCB1400 is not set
# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
+# CONFIG_TOUCHSCREEN_TSC2007 is not set
CONFIG_INPUT_MISC=y
# CONFIG_INPUT_PCSPKR is not set
# CONFIG_INPUT_APANEL is not set
@@ -1179,6 +1231,7 @@ CONFIG_INPUT_MISC=y
# CONFIG_INPUT_KEYSPAN_REMOTE is not set
# CONFIG_INPUT_POWERMATE is not set
# CONFIG_INPUT_YEALINK is not set
+# CONFIG_INPUT_CM109 is not set
# CONFIG_INPUT_UINPUT is not set
#
@@ -1245,6 +1298,7 @@ CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
# CONFIG_SERIAL_JSM is not set
CONFIG_UNIX98_PTYS=y
+# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
@@ -1279,6 +1333,7 @@ CONFIG_I2C=y
CONFIG_I2C_BOARDINFO=y
# CONFIG_I2C_CHARDEV is not set
CONFIG_I2C_HELPER_AUTO=y
+CONFIG_I2C_ALGOBIT=y
#
# I2C Hardware Bus support
@@ -1331,8 +1386,6 @@ CONFIG_I2C_I801=y
# Miscellaneous I2C Chip support
#
# CONFIG_DS1682 is not set
-# CONFIG_AT24 is not set
-# CONFIG_SENSORS_EEPROM is not set
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
@@ -1351,8 +1404,78 @@ CONFIG_POWER_SUPPLY=y
# CONFIG_POWER_SUPPLY_DEBUG is not set
# CONFIG_PDA_POWER is not set
# CONFIG_BATTERY_DS2760 is not set
-# CONFIG_HWMON is not set
+# CONFIG_BATTERY_BQ27x00 is not set
+CONFIG_HWMON=y
+# CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_ABITUGURU is not set
+# CONFIG_SENSORS_ABITUGURU3 is not set
+# CONFIG_SENSORS_AD7414 is not set
+# CONFIG_SENSORS_AD7418 is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ADM1025 is not set
+# CONFIG_SENSORS_ADM1026 is not set
+# CONFIG_SENSORS_ADM1029 is not set
+# CONFIG_SENSORS_ADM1031 is not set
+# CONFIG_SENSORS_ADM9240 is not set
+# CONFIG_SENSORS_ADT7462 is not set
+# CONFIG_SENSORS_ADT7470 is not set
+# CONFIG_SENSORS_ADT7473 is not set
+# CONFIG_SENSORS_ADT7475 is not set
+# CONFIG_SENSORS_K8TEMP is not set
+# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATXP1 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_I5K_AMB is not set
+# CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_F71882FG is not set
+# CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_FSCHER is not set
+# CONFIG_SENSORS_FSCPOS is not set
+# CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_GL520SM is not set
+# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM63 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM77 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM87 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_LM92 is not set
+# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_MAX1619 is not set
+# CONFIG_SENSORS_MAX6650 is not set
+# CONFIG_SENSORS_PC87360 is not set
+# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_SIS5595 is not set
+# CONFIG_SENSORS_DME1737 is not set
+# CONFIG_SENSORS_SMSC47M1 is not set
+# CONFIG_SENSORS_SMSC47M192 is not set
+# CONFIG_SENSORS_SMSC47B397 is not set
+# CONFIG_SENSORS_ADS7828 is not set
+# CONFIG_SENSORS_THMC50 is not set
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_VT8231 is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83791D is not set
+# CONFIG_SENSORS_W83792D is not set
+# CONFIG_SENSORS_W83793 is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83L786NG is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_SENSORS_HDAPS is not set
+# CONFIG_SENSORS_LIS3LV02D is not set
+# CONFIG_SENSORS_APPLESMC is not set
+# CONFIG_HWMON_DEBUG_CHIP is not set
CONFIG_THERMAL=y
+# CONFIG_THERMAL_HWMON is not set
CONFIG_WATCHDOG=y
# CONFIG_WATCHDOG_NOWAYOUT is not set
@@ -1372,6 +1495,7 @@ CONFIG_WATCHDOG=y
# CONFIG_I6300ESB_WDT is not set
# CONFIG_ITCO_WDT is not set
# CONFIG_IT8712F_WDT is not set
+# CONFIG_IT87_WDT is not set
# CONFIG_HP_WATCHDOG is not set
# CONFIG_SC1200_WDT is not set
# CONFIG_PC87413_WDT is not set
@@ -1379,9 +1503,11 @@ CONFIG_WATCHDOG=y
# CONFIG_SBC8360_WDT is not set
# CONFIG_SBC7240_WDT is not set
# CONFIG_CPU5_WDT is not set
+# CONFIG_SMSC_SCH311X_WDT is not set
# CONFIG_SMSC37B787_WDT is not set
# CONFIG_W83627HF_WDT is not set
# CONFIG_W83697HF_WDT is not set
+# CONFIG_W83697UG_WDT is not set
# CONFIG_W83877F_WDT is not set
# CONFIG_W83977F_WDT is not set
# CONFIG_MACHZ_WDT is not set
@@ -1397,11 +1523,11 @@ CONFIG_WATCHDOG=y
# USB-based Watchdog Cards
#
# CONFIG_USBPCWATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
#
# Sonics Silicon Backplane
#
-CONFIG_SSB_POSSIBLE=y
# CONFIG_SSB is not set
#
@@ -1410,7 +1536,13 @@ CONFIG_SSB_POSSIBLE=y
# CONFIG_MFD_CORE is not set
# CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
#
# Multimedia devices
@@ -1450,6 +1582,7 @@ CONFIG_DRM=y
# CONFIG_DRM_I810 is not set
# CONFIG_DRM_I830 is not set
CONFIG_DRM_I915=y
+# CONFIG_DRM_I915_KMS is not set
# CONFIG_DRM_MGA is not set
# CONFIG_DRM_SIS is not set
# CONFIG_DRM_VIA is not set
@@ -1459,6 +1592,7 @@ CONFIG_DRM_I915=y
CONFIG_FB=y
# CONFIG_FIRMWARE_EDID is not set
# CONFIG_FB_DDC is not set
+# CONFIG_FB_BOOT_VESA_SUPPORT is not set
CONFIG_FB_CFB_FILLRECT=y
CONFIG_FB_CFB_COPYAREA=y
CONFIG_FB_CFB_IMAGEBLIT=y
@@ -1487,7 +1621,6 @@ CONFIG_FB_TILEBLITTING=y
# CONFIG_FB_UVESA is not set
# CONFIG_FB_VESA is not set
CONFIG_FB_EFI=y
-# CONFIG_FB_IMAC is not set
# CONFIG_FB_N411 is not set
# CONFIG_FB_HGA is not set
# CONFIG_FB_S1D13XXX is not set
@@ -1503,6 +1636,7 @@ CONFIG_FB_EFI=y
# CONFIG_FB_S3 is not set
# CONFIG_FB_SAVAGE is not set
# CONFIG_FB_SIS is not set
+# CONFIG_FB_VIA is not set
# CONFIG_FB_NEOMAGIC is not set
# CONFIG_FB_KYRO is not set
# CONFIG_FB_3DFX is not set
@@ -1515,12 +1649,15 @@ CONFIG_FB_EFI=y
# CONFIG_FB_CARMINE is not set
# CONFIG_FB_GEODE is not set
# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
-# CONFIG_BACKLIGHT_CORGI is not set
+CONFIG_BACKLIGHT_GENERIC=y
# CONFIG_BACKLIGHT_PROGEAR is not set
# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
+# CONFIG_BACKLIGHT_SAHARA is not set
#
# Display device support
@@ -1540,10 +1677,12 @@ CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_VGA16 is not set
CONFIG_LOGO_LINUX_CLUT224=y
CONFIG_SOUND=y
+CONFIG_SOUND_OSS_CORE=y
CONFIG_SND=y
CONFIG_SND_TIMER=y
CONFIG_SND_PCM=y
CONFIG_SND_HWDEP=y
+CONFIG_SND_JACK=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
CONFIG_SND_OSSEMUL=y
@@ -1551,6 +1690,8 @@ CONFIG_SND_MIXER_OSS=y
CONFIG_SND_PCM_OSS=y
CONFIG_SND_PCM_OSS_PLUGINS=y
CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_HRTIMER=y
+CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
CONFIG_SND_DYNAMIC_MINORS=y
CONFIG_SND_SUPPORT_OLD_API=y
CONFIG_SND_VERBOSE_PROCFS=y
@@ -1605,11 +1746,16 @@ CONFIG_SND_PCI=y
# CONFIG_SND_FM801 is not set
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
+# CONFIG_SND_HDA_RECONFIG is not set
+# CONFIG_SND_HDA_INPUT_BEEP is not set
CONFIG_SND_HDA_CODEC_REALTEK=y
CONFIG_SND_HDA_CODEC_ANALOG=y
CONFIG_SND_HDA_CODEC_SIGMATEL=y
CONFIG_SND_HDA_CODEC_VIA=y
CONFIG_SND_HDA_CODEC_ATIHDMI=y
+CONFIG_SND_HDA_CODEC_NVHDMI=y
+CONFIG_SND_HDA_CODEC_INTELHDMI=y
+CONFIG_SND_HDA_ELD=y
CONFIG_SND_HDA_CODEC_CONEXANT=y
CONFIG_SND_HDA_CODEC_CMEDIA=y
CONFIG_SND_HDA_CODEC_SI3054=y
@@ -1643,6 +1789,7 @@ CONFIG_SND_USB=y
# CONFIG_SND_USB_AUDIO is not set
# CONFIG_SND_USB_USX2Y is not set
# CONFIG_SND_USB_CAIAQ is not set
+# CONFIG_SND_USB_US122L is not set
CONFIG_SND_PCMCIA=y
# CONFIG_SND_VXPOCKET is not set
# CONFIG_SND_PDAUDIOCF is not set
@@ -1657,15 +1804,37 @@ CONFIG_HIDRAW=y
# USB Input Devices
#
CONFIG_USB_HID=y
-CONFIG_USB_HIDINPUT_POWERBOOK=y
-CONFIG_HID_FF=y
CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+
+#
+# Special HID drivers
+#
+CONFIG_HID_COMPAT=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
# CONFIG_LOGIRUMBLEPAD2_FF is not set
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SUNPLUS=y
+# CONFIG_GREENASIA_FF is not set
+CONFIG_HID_TOPSEED=y
CONFIG_THRUSTMASTER_FF=y
CONFIG_ZEROPLUS_FF=y
-CONFIG_USB_HIDDEV=y
CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1683,6 +1852,8 @@ CONFIG_USB_DEVICEFS=y
CONFIG_USB_SUSPEND=y
# CONFIG_USB_OTG is not set
CONFIG_USB_MON=y
+# CONFIG_USB_WUSB is not set
+# CONFIG_USB_WUSB_CBAF is not set
#
# USB Host Controller Drivers
@@ -1691,6 +1862,7 @@ CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+# CONFIG_USB_OXU210HP_HCD is not set
# CONFIG_USB_ISP116X_HCD is not set
# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=y
@@ -1700,6 +1872,8 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set
# CONFIG_USB_R8A66597_HCD is not set
+# CONFIG_USB_WHCI_HCD is not set
+# CONFIG_USB_HWA_HCD is not set
#
# USB Device Class drivers
@@ -1707,20 +1881,20 @@ CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
# CONFIG_USB_WDM is not set
+# CONFIG_USB_TMC is not set
#
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
#
#
-# may also be needed; see USB_STORAGE Help for more information
+# see USB_STORAGE Help for more information
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
# CONFIG_USB_STORAGE_DATAFAB is not set
# CONFIG_USB_STORAGE_FREECOM is not set
# CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_DPCM is not set
# CONFIG_USB_STORAGE_USBAT is not set
# CONFIG_USB_STORAGE_SDDR09 is not set
# CONFIG_USB_STORAGE_SDDR55 is not set
@@ -1728,7 +1902,6 @@ CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_ALAUDA is not set
# CONFIG_USB_STORAGE_ONETOUCH is not set
# CONFIG_USB_STORAGE_KARMA is not set
-# CONFIG_USB_STORAGE_SIERRA is not set
# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
CONFIG_USB_LIBUSUAL=y
@@ -1749,6 +1922,7 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_EMI62 is not set
# CONFIG_USB_EMI26 is not set
# CONFIG_USB_ADUTUX is not set
+# CONFIG_USB_SEVSEG is not set
# CONFIG_USB_RIO500 is not set
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
@@ -1766,7 +1940,13 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_IOWARRIOR is not set
# CONFIG_USB_TEST is not set
# CONFIG_USB_ISIGHTFW is not set
+# CONFIG_USB_VST is not set
# CONFIG_USB_GADGET is not set
+
+#
+# OTG and related infrastructure
+#
+# CONFIG_UWB is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
CONFIG_NEW_LEDS=y
@@ -1775,6 +1955,7 @@ CONFIG_LEDS_CLASS=y
#
# LED drivers
#
+# CONFIG_LEDS_ALIX2 is not set
# CONFIG_LEDS_PCA9532 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set
@@ -1785,6 +1966,7 @@ CONFIG_LEDS_CLASS=y
CONFIG_LEDS_TRIGGERS=y
# CONFIG_LEDS_TRIGGER_TIMER is not set
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
+# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
# CONFIG_ACCESSIBILITY is not set
# CONFIG_INFINIBAND is not set
@@ -1824,6 +2006,7 @@ CONFIG_RTC_INTF_DEV=y
# CONFIG_RTC_DRV_M41T80 is not set
# CONFIG_RTC_DRV_S35390A is not set
# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
#
# SPI RTC drivers
@@ -1833,12 +2016,15 @@ CONFIG_RTC_INTF_DEV=y
# Platform RTC drivers
#
CONFIG_RTC_DRV_CMOS=y
+# CONFIG_RTC_DRV_DS1286 is not set
# CONFIG_RTC_DRV_DS1511 is not set
# CONFIG_RTC_DRV_DS1553 is not set
# CONFIG_RTC_DRV_DS1742 is not set
# CONFIG_RTC_DRV_STK17TA8 is not set
# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
# CONFIG_RTC_DRV_V3020 is not set
#
@@ -1851,6 +2037,22 @@ CONFIG_DMADEVICES=y
#
# CONFIG_INTEL_IOATDMA is not set
# CONFIG_UIO is not set
+# CONFIG_STAGING is not set
+CONFIG_X86_PLATFORM_DEVICES=y
+# CONFIG_ACER_WMI is not set
+# CONFIG_ASUS_LAPTOP is not set
+# CONFIG_FUJITSU_LAPTOP is not set
+# CONFIG_TC1100_WMI is not set
+# CONFIG_MSI_LAPTOP is not set
+# CONFIG_PANASONIC_LAPTOP is not set
+# CONFIG_COMPAL_LAPTOP is not set
+# CONFIG_SONY_LAPTOP is not set
+# CONFIG_THINKPAD_ACPI is not set
+# CONFIG_INTEL_MENLOW is not set
+CONFIG_EEEPC_LAPTOP=y
+# CONFIG_ACPI_WMI is not set
+# CONFIG_ACPI_ASUS is not set
+# CONFIG_ACPI_TOSHIBA is not set
#
# Firmware Drivers
@@ -1861,8 +2063,7 @@ CONFIG_EFI_VARS=y
# CONFIG_DELL_RBU is not set
# CONFIG_DCDBAS is not set
CONFIG_DMIID=y
-CONFIG_ISCSI_IBFT_FIND=y
-CONFIG_ISCSI_IBFT=y
+# CONFIG_ISCSI_IBFT_FIND is not set
#
# File systems
@@ -1872,21 +2073,24 @@ CONFIG_EXT3_FS=y
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
-# CONFIG_EXT4DEV_FS is not set
+# CONFIG_EXT4_FS is not set
CONFIG_JBD=y
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=y
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
CONFIG_FS_POSIX_ACL=y
+CONFIG_FILE_LOCKING=y
# CONFIG_XFS_FS is not set
# CONFIG_OCFS2_FS is not set
+# CONFIG_BTRFS_FS is not set
CONFIG_DNOTIFY=y
CONFIG_INOTIFY=y
CONFIG_INOTIFY_USER=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_QUOTA_TREE=y
# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
CONFIG_QUOTACTL=y
@@ -1920,16 +2124,14 @@ CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
CONFIG_PROC_VMCORE=y
CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
# CONFIG_CONFIGFS_FS is not set
-
-#
-# Miscellaneous filesystems
-#
+CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
# CONFIG_ECRYPT_FS is not set
@@ -1939,6 +2141,7 @@ CONFIG_HUGETLB_PAGE=y
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
# CONFIG_CRAMFS is not set
+# CONFIG_SQUASHFS is not set
# CONFIG_VXFS_FS is not set
# CONFIG_MINIX_FS is not set
# CONFIG_OMFS_FS is not set
@@ -1960,6 +2163,7 @@ CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
+# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=y
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
@@ -2036,7 +2240,7 @@ CONFIG_NLS_UTF8=y
#
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_PRINTK_TIME=y
-CONFIG_ENABLE_WARN_DEPRECATED=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_ENABLE_MUST_CHECK=y
CONFIG_FRAME_WARN=2048
CONFIG_MAGIC_SYSRQ=y
@@ -2066,33 +2270,54 @@ CONFIG_TIMER_STATS=y
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set
# CONFIG_DEBUG_VM is not set
+# CONFIG_DEBUG_VIRTUAL is not set
# CONFIG_DEBUG_WRITECOUNT is not set
CONFIG_DEBUG_MEMORY_INIT=y
# CONFIG_DEBUG_LIST is not set
# CONFIG_DEBUG_SG is not set
+# CONFIG_DEBUG_NOTIFIERS is not set
+CONFIG_ARCH_WANT_FRAME_POINTERS=y
CONFIG_FRAME_POINTER=y
# CONFIG_BOOT_PRINTK_DELAY is not set
# CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_KPROBES_SANITY_TEST is not set
# CONFIG_BACKTRACE_SELF_TEST is not set
+# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
-CONFIG_HAVE_FTRACE=y
+CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
-# CONFIG_FTRACE is not set
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_HAVE_HW_BRANCH_TRACER=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
# CONFIG_IRQSOFF_TRACER is not set
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_POWER_TRACER is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_HW_BRANCH_TRACER is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
# CONFIG_STRICT_DEVMEM is not set
CONFIG_X86_VERBOSE_BOOTUP=y
CONFIG_EARLY_PRINTK=y
+CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
# CONFIG_DEBUG_PAGEALLOC is not set
@@ -2123,8 +2348,10 @@ CONFIG_OPTIMIZE_INLINING=y
CONFIG_KEYS=y
CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
+# CONFIG_SECURITYFS is not set
CONFIG_SECURITY_NETWORK=y
# CONFIG_SECURITY_NETWORK_XFRM is not set
+# CONFIG_SECURITY_PATH is not set
CONFIG_SECURITY_FILE_CAPABILITIES=y
# CONFIG_SECURITY_ROOTPLUG is not set
CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2135,7 +2362,6 @@ CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_SECURITY_SELINUX_DEVELOP=y
CONFIG_SECURITY_SELINUX_AVC_STATS=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
-# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
# CONFIG_SECURITY_SMACK is not set
CONFIG_CRYPTO=y
@@ -2143,11 +2369,18 @@ CONFIG_CRYPTO=y
#
# Crypto core or helper
#
+# CONFIG_CRYPTO_FIPS is not set
CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
CONFIG_CRYPTO_AEAD=y
+CONFIG_CRYPTO_AEAD2=y
CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG2=y
CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_NULL is not set
# CONFIG_CRYPTO_CRYPTD is not set
@@ -2182,6 +2415,7 @@ CONFIG_CRYPTO_HMAC=y
# Digest
#
# CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_CRC32C_INTEL is not set
# CONFIG_CRYPTO_MD4 is not set
CONFIG_CRYPTO_MD5=y
# CONFIG_CRYPTO_MICHAEL_MIC is not set
@@ -2222,6 +2456,11 @@ CONFIG_CRYPTO_DES=y
#
# CONFIG_CRYPTO_DEFLATE is not set
# CONFIG_CRYPTO_LZO is not set
+
+#
+# Random Number Generation
+#
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_PADLOCK is not set
# CONFIG_CRYPTO_DEV_GEODE is not set
@@ -2239,6 +2478,7 @@ CONFIG_VIRTUALIZATION=y
CONFIG_BITREVERSE=y
CONFIG_GENERIC_FIND_FIRST_BIT=y
CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_FIND_LAST_BIT=y
# CONFIG_CRC_CCITT is not set
# CONFIG_CRC16 is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index f0a03d7a7d6..2efb5d5063f 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,14 +1,13 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc5
-# Wed Sep 3 17:13:39 2008
+# Linux kernel version: 2.6.29-rc4
+# Thu Feb 12 12:57:29 2009
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
CONFIG_X86_64=y
CONFIG_X86=y
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
-# CONFIG_GENERIC_LOCKBREAK is not set
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_CLOCKSOURCE_WATCHDOG=y
@@ -23,17 +22,16 @@ CONFIG_ZONE_DMA=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
CONFIG_GENERIC_HWEIGHT=y
-# CONFIG_GENERIC_GPIO is not set
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
-# CONFIG_ARCH_HAS_ILOG2_U32 is not set
-# CONFIG_ARCH_HAS_ILOG2_U64 is not set
CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_ARCH_HAS_CPU_RELAX=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
@@ -42,12 +40,12 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_ZONE_DMA32=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_AUDIT_ARCH=y
-CONFIG_ARCH_SUPPORTS_AOUT=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_X86_SMP=y
+CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_X86_64_SMP=y
CONFIG_X86_HT=y
CONFIG_X86_BIOS_REBOOT=y
@@ -76,30 +74,44 @@ CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_TREE=y
+
+#
+# RCU Subsystem
+#
+# CONFIG_CLASSIC_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=64
+# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
-CONFIG_LOG_BUF_SHIFT=17
-CONFIG_CGROUPS=y
-# CONFIG_CGROUP_DEBUG is not set
-CONFIG_CGROUP_NS=y
-# CONFIG_CGROUP_DEVICE is not set
-CONFIG_CPUSETS=y
+CONFIG_LOG_BUF_SHIFT=18
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
CONFIG_GROUP_SCHED=y
CONFIG_FAIR_GROUP_SCHED=y
# CONFIG_RT_GROUP_SCHED is not set
# CONFIG_USER_SCHED is not set
CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUPS=y
+# CONFIG_CGROUP_DEBUG is not set
+CONFIG_CGROUP_NS=y
+CONFIG_CGROUP_FREEZER=y
+# CONFIG_CGROUP_DEVICE is not set
+CONFIG_CPUSETS=y
+CONFIG_PROC_PID_CPUSET=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_RESOURCE_COUNTERS=y
# CONFIG_CGROUP_MEM_RES_CTLR is not set
# CONFIG_SYSFS_DEPRECATED_V2 is not set
-CONFIG_PROC_PID_CPUSET=y
CONFIG_RELAY=y
CONFIG_NAMESPACES=y
CONFIG_UTS_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
CONFIG_PID_NS=y
+CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -124,12 +136,15 @@ CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
+CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_PCI_QUIRKS=y
CONFIG_SLUB_DEBUG=y
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
CONFIG_MARKERS=y
# CONFIG_OPROFILE is not set
CONFIG_HAVE_OPROFILE=y
@@ -139,15 +154,10 @@ CONFIG_KRETPROBES=y
CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
-# CONFIG_HAVE_ARCH_TRACEHOOK is not set
-# CONFIG_HAVE_DMA_ATTRS is not set
-CONFIG_USE_GENERIC_SMP_HELPERS=y
-# CONFIG_HAVE_CLK is not set
-CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
-# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
# CONFIG_MODULE_FORCE_LOAD is not set
@@ -155,7 +165,6 @@ CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_KMOD=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
CONFIG_BLK_DEV_IO_TRACE=y
@@ -175,7 +184,7 @@ CONFIG_IOSCHED_CFQ=y
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"
-CONFIG_CLASSIC_RCU=y
+CONFIG_FREEZER=y
#
# Processor type and features
@@ -185,13 +194,15 @@ CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
+CONFIG_SPARSE_IRQ=y
+# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
-CONFIG_X86_PC=y
# CONFIG_X86_ELAN is not set
# CONFIG_X86_VOYAGER is not set
# CONFIG_X86_GENERICARCH is not set
# CONFIG_X86_VSMP is not set
+CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
# CONFIG_M386 is not set
@@ -230,6 +241,11 @@ CONFIG_X86_CMPXCHG64=y
CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=64
CONFIG_X86_DEBUGCTLMSR=y
+CONFIG_CPU_SUP_INTEL=y
+CONFIG_CPU_SUP_AMD=y
+CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_X86_DS=y
+CONFIG_X86_PTRACE_BTS=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
@@ -237,8 +253,11 @@ CONFIG_GART_IOMMU=y
CONFIG_CALGARY_IOMMU=y
CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
CONFIG_AMD_IOMMU=y
+CONFIG_AMD_IOMMU_STATS=y
CONFIG_SWIOTLB=y
CONFIG_IOMMU_HELPER=y
+CONFIG_IOMMU_API=y
+# CONFIG_MAXSMP is not set
CONFIG_NR_CPUS=64
CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
@@ -247,12 +266,17 @@ CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
+CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
# CONFIG_X86_MCE is not set
# CONFIG_I8K is not set
CONFIG_MICROCODE=y
+CONFIG_MICROCODE_INTEL=y
+CONFIG_MICROCODE_AMD=y
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
+CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
+CONFIG_DIRECT_GBPAGES=y
CONFIG_NUMA=y
CONFIG_K8_NUMA=y
CONFIG_X86_64_ACPI_NUMA=y
@@ -269,7 +293,6 @@ CONFIG_SPARSEMEM_MANUAL=y
CONFIG_SPARSEMEM=y
CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_HAVE_MEMORY_PRESENT=y
-# CONFIG_SPARSEMEM_STATIC is not set
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y
@@ -280,10 +303,14 @@ CONFIG_SPARSEMEM_VMEMMAP=y
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
-CONFIG_RESOURCES_64BIT=y
+CONFIG_PHYS_ADDR_T_64BIT=y
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_X86_CHECK_BIOS_CORRUPTION=y
+CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
+CONFIG_X86_RESERVE_LOW_64K=y
CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
CONFIG_X86_PAT=y
@@ -298,15 +325,16 @@ CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
+# CONFIG_RELOCATABLE is not set
CONFIG_PHYSICAL_ALIGN=0x200000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
+# CONFIG_CMDLINE_BOOL is not set
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
#
-# Power management options
+# Power management and ACPI options
#
CONFIG_ARCH_HIBERNATION_HEADER=y
CONFIG_PM=y
@@ -333,20 +361,14 @@ CONFIG_ACPI_BATTERY=y
CONFIG_ACPI_BUTTON=y
CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
-# CONFIG_ACPI_BAY is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
CONFIG_ACPI_NUMA=y
-# CONFIG_ACPI_WMI is not set
-# CONFIG_ACPI_ASUS is not set
-# CONFIG_ACPI_TOSHIBA is not set
# CONFIG_ACPI_CUSTOM_DSDT is not set
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
-CONFIG_ACPI_EC=y
# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_POWER=y
CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
@@ -381,13 +403,17 @@ CONFIG_X86_ACPI_CPUFREQ=y
#
# shared options
#
-# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
# CONFIG_X86_SPEEDSTEP_LIB is not set
CONFIG_CPU_IDLE=y
CONFIG_CPU_IDLE_GOV_LADDER=y
CONFIG_CPU_IDLE_GOV_MENU=y
#
+# Memory power savings
+#
+# CONFIG_I7300_IDLE is not set
+
+#
# Bus options (PCI etc.)
#
CONFIG_PCI=y
@@ -395,8 +421,10 @@ CONFIG_PCI_DIRECT=y
CONFIG_PCI_MMCONFIG=y
CONFIG_PCI_DOMAINS=y
CONFIG_DMAR=y
+# CONFIG_DMAR_DEFAULT_ON is not set
CONFIG_DMAR_GFX_WA=y
CONFIG_DMAR_FLOPPY_WA=y
+# CONFIG_INTR_REMAP is not set
CONFIG_PCIEPORTBUS=y
# CONFIG_HOTPLUG_PCI_PCIE is not set
CONFIG_PCIEAER=y
@@ -405,6 +433,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
# CONFIG_PCI_LEGACY is not set
# CONFIG_PCI_DEBUG is not set
+# CONFIG_PCI_STUB is not set
CONFIG_HT_IRQ=y
CONFIG_ISA_DMA_API=y
CONFIG_K8_NB=y
@@ -438,6 +467,8 @@ CONFIG_HOTPLUG_PCI=y
#
CONFIG_BINFMT_ELF=y
CONFIG_COMPAT_BINFMT_ELF=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
+# CONFIG_HAVE_AOUT is not set
CONFIG_BINFMT_MISC=y
CONFIG_IA32_EMULATION=y
# CONFIG_IA32_AOUT is not set
@@ -449,6 +480,7 @@ CONFIG_NET=y
#
# Networking options
#
+CONFIG_COMPAT_NET_DEV_OPS=y
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
@@ -509,7 +541,6 @@ CONFIG_DEFAULT_CUBIC=y
# CONFIG_DEFAULT_RENO is not set
CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_TCP_MD5SIG=y
-# CONFIG_IP_VS is not set
CONFIG_IPV6=y
# CONFIG_IPV6_PRIVACY is not set
# CONFIG_IPV6_ROUTER_PREF is not set
@@ -547,19 +578,21 @@ CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CT_NETLINK=y
CONFIG_NETFILTER_XTABLES=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
+# CONFIG_IP_VS is not set
#
# IP: Netfilter Configuration
#
+CONFIG_NF_DEFRAG_IPV4=y
CONFIG_NF_CONNTRACK_IPV4=y
CONFIG_NF_CONNTRACK_PROC_COMPAT=y
CONFIG_IP_NF_IPTABLES=y
@@ -585,8 +618,8 @@ CONFIG_IP_NF_MANGLE=y
CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_IPV6HEADER=y
-CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_LOG=y
+CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
# CONFIG_IP_DCCP is not set
@@ -594,6 +627,7 @@ CONFIG_IP6_NF_MANGLE=y
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
# CONFIG_BRIDGE is not set
+# CONFIG_NET_DSA is not set
# CONFIG_VLAN_8021Q is not set
# CONFIG_DECNET is not set
CONFIG_LLC=y
@@ -613,6 +647,7 @@ CONFIG_NET_SCHED=y
# CONFIG_NET_SCH_HTB is not set
# CONFIG_NET_SCH_HFSC is not set
# CONFIG_NET_SCH_PRIO is not set
+# CONFIG_NET_SCH_MULTIQ is not set
# CONFIG_NET_SCH_RED is not set
# CONFIG_NET_SCH_SFQ is not set
# CONFIG_NET_SCH_TEQL is not set
@@ -620,6 +655,7 @@ CONFIG_NET_SCHED=y
# CONFIG_NET_SCH_GRED is not set
# CONFIG_NET_SCH_DSMARK is not set
# CONFIG_NET_SCH_NETEM is not set
+# CONFIG_NET_SCH_DRR is not set
# CONFIG_NET_SCH_INGRESS is not set
#
@@ -634,6 +670,7 @@ CONFIG_NET_CLS=y
# CONFIG_NET_CLS_RSVP is not set
# CONFIG_NET_CLS_RSVP6 is not set
# CONFIG_NET_CLS_FLOW is not set
+# CONFIG_NET_CLS_CGROUP is not set
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_STACK=32
# CONFIG_NET_EMATCH_CMP is not set
@@ -649,7 +686,9 @@ CONFIG_NET_CLS_ACT=y
# CONFIG_NET_ACT_NAT is not set
# CONFIG_NET_ACT_PEDIT is not set
# CONFIG_NET_ACT_SIMP is not set
+# CONFIG_NET_ACT_SKBEDIT is not set
CONFIG_NET_SCH_FIFO=y
+# CONFIG_DCB is not set
#
# Network testing
@@ -666,29 +705,33 @@ CONFIG_HAMRADIO=y
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
+# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
-
-#
-# Wireless
-#
+CONFIG_WIRELESS=y
CONFIG_CFG80211=y
+# CONFIG_CFG80211_REG_DEBUG is not set
CONFIG_NL80211=y
+CONFIG_WIRELESS_OLD_REGULATORY=y
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
+# CONFIG_LIB80211 is not set
CONFIG_MAC80211=y
#
# Rate control algorithm selection
#
-CONFIG_MAC80211_RC_PID=y
-CONFIG_MAC80211_RC_DEFAULT_PID=y
-CONFIG_MAC80211_RC_DEFAULT="pid"
+CONFIG_MAC80211_RC_MINSTREL=y
+# CONFIG_MAC80211_RC_DEFAULT_PID is not set
+CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
+CONFIG_MAC80211_RC_DEFAULT="minstrel"
# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
# CONFIG_MAC80211_DEBUGFS is not set
# CONFIG_MAC80211_DEBUG_MENU is not set
-# CONFIG_IEEE80211 is not set
-# CONFIG_RFKILL is not set
+# CONFIG_WIMAX is not set
+CONFIG_RFKILL=y
+# CONFIG_RFKILL_INPUT is not set
+CONFIG_RFKILL_LEDS=y
# CONFIG_NET_9P is not set
#
@@ -712,7 +755,7 @@ CONFIG_PROC_EVENTS=y
# CONFIG_MTD is not set
# CONFIG_PARPORT is not set
CONFIG_PNP=y
-# CONFIG_PNP_DEBUG is not set
+CONFIG_PNP_DEBUG_MESSAGES=y
#
# Protocols
@@ -740,21 +783,21 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
CONFIG_MISC_DEVICES=y
# CONFIG_IBM_ASM is not set
# CONFIG_PHANTOM is not set
-# CONFIG_EEPROM_93CX6 is not set
# CONFIG_SGI_IOC4 is not set
# CONFIG_TIFM_CORE is not set
-# CONFIG_ACER_WMI is not set
-# CONFIG_ASUS_LAPTOP is not set
-# CONFIG_FUJITSU_LAPTOP is not set
-# CONFIG_MSI_LAPTOP is not set
-# CONFIG_COMPAL_LAPTOP is not set
-# CONFIG_SONY_LAPTOP is not set
-# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_INTEL_MENLOW is not set
+# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_SGI_XP is not set
# CONFIG_HP_ILO is not set
# CONFIG_SGI_GRU is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_93CX6 is not set
CONFIG_HAVE_IDE=y
# CONFIG_IDE is not set
@@ -793,7 +836,7 @@ CONFIG_SCSI_WAIT_SCAN=m
#
CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_FC_ATTRS is not set
-CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_ISCSI_ATTRS is not set
# CONFIG_SCSI_SAS_ATTRS is not set
# CONFIG_SCSI_SAS_LIBSAS is not set
# CONFIG_SCSI_SRP_ATTRS is not set
@@ -864,6 +907,7 @@ CONFIG_PATA_OLDPIIX=y
CONFIG_PATA_SCH=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
+CONFIG_MD_AUTODETECT=y
# CONFIG_MD_LINEAR is not set
# CONFIG_MD_RAID0 is not set
# CONFIG_MD_RAID1 is not set
@@ -919,6 +963,9 @@ CONFIG_PHYLIB=y
# CONFIG_BROADCOM_PHY is not set
# CONFIG_ICPLUS_PHY is not set
# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
# CONFIG_FIXED_PHY is not set
# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
@@ -942,6 +989,9 @@ CONFIG_NET_TULIP=y
# CONFIG_IBM_NEW_EMAC_RGMII is not set
# CONFIG_IBM_NEW_EMAC_TAH is not set
# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
+# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
+# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
+# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
# CONFIG_PCNET32 is not set
# CONFIG_AMD8111_ETH is not set
@@ -949,7 +999,6 @@ CONFIG_NET_PCI=y
# CONFIG_B44 is not set
CONFIG_FORCEDETH=y
# CONFIG_FORCEDETH_NAPI is not set
-# CONFIG_EEPRO100 is not set
CONFIG_E100=y
# CONFIG_FEALNX is not set
# CONFIG_NATSEMI is not set
@@ -963,15 +1012,16 @@ CONFIG_8139TOO_PIO=y
# CONFIG_R6040 is not set
# CONFIG_SIS900 is not set
# CONFIG_EPIC100 is not set
+# CONFIG_SMSC9420 is not set
# CONFIG_SUNDANCE is not set
# CONFIG_TLAN is not set
# CONFIG_VIA_RHINE is not set
# CONFIG_SC92031 is not set
+# CONFIG_ATL2 is not set
CONFIG_NETDEV_1000=y
# CONFIG_ACENIC is not set
# CONFIG_DL2K is not set
CONFIG_E1000=y
-# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
# CONFIG_E1000E is not set
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
@@ -989,18 +1039,23 @@ CONFIG_TIGON3=y
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
+# CONFIG_JME is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
+CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_CHELSIO_T3 is not set
+# CONFIG_ENIC is not set
# CONFIG_IXGBE is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_NIU is not set
+# CONFIG_MLX4_EN is not set
# CONFIG_MLX4_CORE is not set
# CONFIG_TEHUTI is not set
# CONFIG_BNX2X is not set
+# CONFIG_QLGE is not set
# CONFIG_SFC is not set
CONFIG_TR=y
# CONFIG_IBMOL is not set
@@ -1013,9 +1068,8 @@ CONFIG_TR=y
# CONFIG_WLAN_PRE80211 is not set
CONFIG_WLAN_80211=y
# CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
# CONFIG_LIBERTAS is not set
+# CONFIG_LIBERTAS_THINFIRM is not set
# CONFIG_AIRO is not set
# CONFIG_HERMES is not set
# CONFIG_ATMEL is not set
@@ -1032,6 +1086,8 @@ CONFIG_WLAN_80211=y
CONFIG_ATH5K=y
# CONFIG_ATH5K_DEBUG is not set
# CONFIG_ATH9K is not set
+# CONFIG_IPW2100 is not set
+# CONFIG_IPW2200 is not set
# CONFIG_IWLCORE is not set
# CONFIG_IWLWIFI_LEDS is not set
# CONFIG_IWLAGN is not set
@@ -1043,6 +1099,10 @@ CONFIG_ATH5K=y
# CONFIG_RT2X00 is not set
#
+# Enable WiMAX (Networking options) to see the WiMAX drivers
+#
+
+#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
@@ -1050,6 +1110,7 @@ CONFIG_ATH5K=y
# CONFIG_USB_PEGASUS is not set
# CONFIG_USB_RTL8150 is not set
# CONFIG_USB_USBNET is not set
+# CONFIG_USB_HSO is not set
CONFIG_NET_PCMCIA=y
# CONFIG_PCMCIA_3C589 is not set
# CONFIG_PCMCIA_3C574 is not set
@@ -1059,6 +1120,7 @@ CONFIG_NET_PCMCIA=y
# CONFIG_PCMCIA_SMC91C92 is not set
# CONFIG_PCMCIA_XIRC2PS is not set
# CONFIG_PCMCIA_AXNET is not set
+# CONFIG_PCMCIA_IBMTR is not set
# CONFIG_WAN is not set
CONFIG_FDDI=y
# CONFIG_DEFXX is not set
@@ -1110,6 +1172,7 @@ CONFIG_MOUSE_PS2_LOGIPS2PP=y
CONFIG_MOUSE_PS2_SYNAPTICS=y
CONFIG_MOUSE_PS2_LIFEBOOK=y
CONFIG_MOUSE_PS2_TRACKPOINT=y
+# CONFIG_MOUSE_PS2_ELANTECH is not set
# CONFIG_MOUSE_PS2_TOUCHKIT is not set
# CONFIG_MOUSE_SERIAL is not set
# CONFIG_MOUSE_APPLETOUCH is not set
@@ -1147,15 +1210,16 @@ CONFIG_INPUT_TOUCHSCREEN=y
# CONFIG_TOUCHSCREEN_FUJITSU is not set
# CONFIG_TOUCHSCREEN_GUNZE is not set
# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
# CONFIG_TOUCHSCREEN_MTOUCH is not set
# CONFIG_TOUCHSCREEN_INEXIO is not set
# CONFIG_TOUCHSCREEN_MK712 is not set
# CONFIG_TOUCHSCREEN_PENMOUNT is not set
# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
-# CONFIG_TOUCHSCREEN_UCB1400 is not set
# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
+# CONFIG_TOUCHSCREEN_TSC2007 is not set
CONFIG_INPUT_MISC=y
# CONFIG_INPUT_PCSPKR is not set
# CONFIG_INPUT_APANEL is not set
@@ -1165,6 +1229,7 @@ CONFIG_INPUT_MISC=y
# CONFIG_INPUT_KEYSPAN_REMOTE is not set
# CONFIG_INPUT_POWERMATE is not set
# CONFIG_INPUT_YEALINK is not set
+# CONFIG_INPUT_CM109 is not set
# CONFIG_INPUT_UINPUT is not set
#
@@ -1231,6 +1296,7 @@ CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
# CONFIG_SERIAL_JSM is not set
CONFIG_UNIX98_PTYS=y
+# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
@@ -1260,6 +1326,7 @@ CONFIG_I2C=y
CONFIG_I2C_BOARDINFO=y
# CONFIG_I2C_CHARDEV is not set
CONFIG_I2C_HELPER_AUTO=y
+CONFIG_I2C_ALGOBIT=y
#
# I2C Hardware Bus support
@@ -1311,8 +1378,6 @@ CONFIG_I2C_I801=y
# Miscellaneous I2C Chip support
#
# CONFIG_DS1682 is not set
-# CONFIG_AT24 is not set
-# CONFIG_SENSORS_EEPROM is not set
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
@@ -1331,8 +1396,78 @@ CONFIG_POWER_SUPPLY=y
# CONFIG_POWER_SUPPLY_DEBUG is not set
# CONFIG_PDA_POWER is not set
# CONFIG_BATTERY_DS2760 is not set
-# CONFIG_HWMON is not set
+# CONFIG_BATTERY_BQ27x00 is not set
+CONFIG_HWMON=y
+# CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_ABITUGURU is not set
+# CONFIG_SENSORS_ABITUGURU3 is not set
+# CONFIG_SENSORS_AD7414 is not set
+# CONFIG_SENSORS_AD7418 is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ADM1025 is not set
+# CONFIG_SENSORS_ADM1026 is not set
+# CONFIG_SENSORS_ADM1029 is not set
+# CONFIG_SENSORS_ADM1031 is not set
+# CONFIG_SENSORS_ADM9240 is not set
+# CONFIG_SENSORS_ADT7462 is not set
+# CONFIG_SENSORS_ADT7470 is not set
+# CONFIG_SENSORS_ADT7473 is not set
+# CONFIG_SENSORS_ADT7475 is not set
+# CONFIG_SENSORS_K8TEMP is not set
+# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATXP1 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_I5K_AMB is not set
+# CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_F71882FG is not set
+# CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_FSCHER is not set
+# CONFIG_SENSORS_FSCPOS is not set
+# CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_GL520SM is not set
+# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM63 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM77 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM87 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_LM92 is not set
+# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_MAX1619 is not set
+# CONFIG_SENSORS_MAX6650 is not set
+# CONFIG_SENSORS_PC87360 is not set
+# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_SIS5595 is not set
+# CONFIG_SENSORS_DME1737 is not set
+# CONFIG_SENSORS_SMSC47M1 is not set
+# CONFIG_SENSORS_SMSC47M192 is not set
+# CONFIG_SENSORS_SMSC47B397 is not set
+# CONFIG_SENSORS_ADS7828 is not set
+# CONFIG_SENSORS_THMC50 is not set
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_VT8231 is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83791D is not set
+# CONFIG_SENSORS_W83792D is not set
+# CONFIG_SENSORS_W83793 is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83L786NG is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_SENSORS_HDAPS is not set
+# CONFIG_SENSORS_LIS3LV02D is not set
+# CONFIG_SENSORS_APPLESMC is not set
+# CONFIG_HWMON_DEBUG_CHIP is not set
CONFIG_THERMAL=y
+# CONFIG_THERMAL_HWMON is not set
CONFIG_WATCHDOG=y
# CONFIG_WATCHDOG_NOWAYOUT is not set
@@ -1352,15 +1487,18 @@ CONFIG_WATCHDOG=y
# CONFIG_I6300ESB_WDT is not set
# CONFIG_ITCO_WDT is not set
# CONFIG_IT8712F_WDT is not set
+# CONFIG_IT87_WDT is not set
# CONFIG_HP_WATCHDOG is not set
# CONFIG_SC1200_WDT is not set
# CONFIG_PC87413_WDT is not set
# CONFIG_60XX_WDT is not set
# CONFIG_SBC8360_WDT is not set
# CONFIG_CPU5_WDT is not set
+# CONFIG_SMSC_SCH311X_WDT is not set
# CONFIG_SMSC37B787_WDT is not set
# CONFIG_W83627HF_WDT is not set
# CONFIG_W83697HF_WDT is not set
+# CONFIG_W83697UG_WDT is not set
# CONFIG_W83877F_WDT is not set
# CONFIG_W83977F_WDT is not set
# CONFIG_MACHZ_WDT is not set
@@ -1376,11 +1514,11 @@ CONFIG_WATCHDOG=y
# USB-based Watchdog Cards
#
# CONFIG_USBPCWATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
#
# Sonics Silicon Backplane
#
-CONFIG_SSB_POSSIBLE=y
# CONFIG_SSB is not set
#
@@ -1389,7 +1527,13 @@ CONFIG_SSB_POSSIBLE=y
# CONFIG_MFD_CORE is not set
# CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
#
# Multimedia devices
@@ -1423,6 +1567,7 @@ CONFIG_DRM=y
# CONFIG_DRM_I810 is not set
# CONFIG_DRM_I830 is not set
CONFIG_DRM_I915=y
+CONFIG_DRM_I915_KMS=y
# CONFIG_DRM_MGA is not set
# CONFIG_DRM_SIS is not set
# CONFIG_DRM_VIA is not set
@@ -1432,6 +1577,7 @@ CONFIG_DRM_I915=y
CONFIG_FB=y
# CONFIG_FIRMWARE_EDID is not set
# CONFIG_FB_DDC is not set
+# CONFIG_FB_BOOT_VESA_SUPPORT is not set
CONFIG_FB_CFB_FILLRECT=y
CONFIG_FB_CFB_COPYAREA=y
CONFIG_FB_CFB_IMAGEBLIT=y
@@ -1460,7 +1606,6 @@ CONFIG_FB_TILEBLITTING=y
# CONFIG_FB_UVESA is not set
# CONFIG_FB_VESA is not set
CONFIG_FB_EFI=y
-# CONFIG_FB_IMAC is not set
# CONFIG_FB_N411 is not set
# CONFIG_FB_HGA is not set
# CONFIG_FB_S1D13XXX is not set
@@ -1475,6 +1620,7 @@ CONFIG_FB_EFI=y
# CONFIG_FB_S3 is not set
# CONFIG_FB_SAVAGE is not set
# CONFIG_FB_SIS is not set
+# CONFIG_FB_VIA is not set
# CONFIG_FB_NEOMAGIC is not set
# CONFIG_FB_KYRO is not set
# CONFIG_FB_3DFX is not set
@@ -1486,12 +1632,15 @@ CONFIG_FB_EFI=y
# CONFIG_FB_CARMINE is not set
# CONFIG_FB_GEODE is not set
# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
-# CONFIG_BACKLIGHT_CORGI is not set
+CONFIG_BACKLIGHT_GENERIC=y
# CONFIG_BACKLIGHT_PROGEAR is not set
# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
+# CONFIG_BACKLIGHT_SAHARA is not set
#
# Display device support
@@ -1511,10 +1660,12 @@ CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_VGA16 is not set
CONFIG_LOGO_LINUX_CLUT224=y
CONFIG_SOUND=y
+CONFIG_SOUND_OSS_CORE=y
CONFIG_SND=y
CONFIG_SND_TIMER=y
CONFIG_SND_PCM=y
CONFIG_SND_HWDEP=y
+CONFIG_SND_JACK=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
CONFIG_SND_OSSEMUL=y
@@ -1522,6 +1673,8 @@ CONFIG_SND_MIXER_OSS=y
CONFIG_SND_PCM_OSS=y
CONFIG_SND_PCM_OSS_PLUGINS=y
CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_HRTIMER=y
+CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
CONFIG_SND_DYNAMIC_MINORS=y
CONFIG_SND_SUPPORT_OLD_API=y
CONFIG_SND_VERBOSE_PROCFS=y
@@ -1575,11 +1728,16 @@ CONFIG_SND_PCI=y
# CONFIG_SND_FM801 is not set
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
+# CONFIG_SND_HDA_RECONFIG is not set
+# CONFIG_SND_HDA_INPUT_BEEP is not set
CONFIG_SND_HDA_CODEC_REALTEK=y
CONFIG_SND_HDA_CODEC_ANALOG=y
CONFIG_SND_HDA_CODEC_SIGMATEL=y
CONFIG_SND_HDA_CODEC_VIA=y
CONFIG_SND_HDA_CODEC_ATIHDMI=y
+CONFIG_SND_HDA_CODEC_NVHDMI=y
+CONFIG_SND_HDA_CODEC_INTELHDMI=y
+CONFIG_SND_HDA_ELD=y
CONFIG_SND_HDA_CODEC_CONEXANT=y
CONFIG_SND_HDA_CODEC_CMEDIA=y
CONFIG_SND_HDA_CODEC_SI3054=y
@@ -1612,6 +1770,7 @@ CONFIG_SND_USB=y
# CONFIG_SND_USB_AUDIO is not set
# CONFIG_SND_USB_USX2Y is not set
# CONFIG_SND_USB_CAIAQ is not set
+# CONFIG_SND_USB_US122L is not set
CONFIG_SND_PCMCIA=y
# CONFIG_SND_VXPOCKET is not set
# CONFIG_SND_PDAUDIOCF is not set
@@ -1626,15 +1785,37 @@ CONFIG_HIDRAW=y
# USB Input Devices
#
CONFIG_USB_HID=y
-CONFIG_USB_HIDINPUT_POWERBOOK=y
-CONFIG_HID_FF=y
CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+
+#
+# Special HID drivers
+#
+CONFIG_HID_COMPAT=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
# CONFIG_LOGIRUMBLEPAD2_FF is not set
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SUNPLUS=y
+# CONFIG_GREENASIA_FF is not set
+CONFIG_HID_TOPSEED=y
CONFIG_THRUSTMASTER_FF=y
CONFIG_ZEROPLUS_FF=y
-CONFIG_USB_HIDDEV=y
CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1652,6 +1833,8 @@ CONFIG_USB_DEVICEFS=y
CONFIG_USB_SUSPEND=y
# CONFIG_USB_OTG is not set
CONFIG_USB_MON=y
+# CONFIG_USB_WUSB is not set
+# CONFIG_USB_WUSB_CBAF is not set
#
# USB Host Controller Drivers
@@ -1660,6 +1843,7 @@ CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+# CONFIG_USB_OXU210HP_HCD is not set
# CONFIG_USB_ISP116X_HCD is not set
# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=y
@@ -1669,6 +1853,8 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set
# CONFIG_USB_R8A66597_HCD is not set
+# CONFIG_USB_WHCI_HCD is not set
+# CONFIG_USB_HWA_HCD is not set
#
# USB Device Class drivers
@@ -1676,20 +1862,20 @@ CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
# CONFIG_USB_WDM is not set
+# CONFIG_USB_TMC is not set
#
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
#
#
-# may also be needed; see USB_STORAGE Help for more information
+# see USB_STORAGE Help for more information
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
# CONFIG_USB_STORAGE_DATAFAB is not set
# CONFIG_USB_STORAGE_FREECOM is not set
# CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_DPCM is not set
# CONFIG_USB_STORAGE_USBAT is not set
# CONFIG_USB_STORAGE_SDDR09 is not set
# CONFIG_USB_STORAGE_SDDR55 is not set
@@ -1697,7 +1883,6 @@ CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_ALAUDA is not set
# CONFIG_USB_STORAGE_ONETOUCH is not set
# CONFIG_USB_STORAGE_KARMA is not set
-# CONFIG_USB_STORAGE_SIERRA is not set
# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
CONFIG_USB_LIBUSUAL=y
@@ -1718,6 +1903,7 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_EMI62 is not set
# CONFIG_USB_EMI26 is not set
# CONFIG_USB_ADUTUX is not set
+# CONFIG_USB_SEVSEG is not set
# CONFIG_USB_RIO500 is not set
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
@@ -1735,7 +1921,13 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_IOWARRIOR is not set
# CONFIG_USB_TEST is not set
# CONFIG_USB_ISIGHTFW is not set
+# CONFIG_USB_VST is not set
# CONFIG_USB_GADGET is not set
+
+#
+# OTG and related infrastructure
+#
+# CONFIG_UWB is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
CONFIG_NEW_LEDS=y
@@ -1744,6 +1936,7 @@ CONFIG_LEDS_CLASS=y
#
# LED drivers
#
+# CONFIG_LEDS_ALIX2 is not set
# CONFIG_LEDS_PCA9532 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set
@@ -1754,6 +1947,7 @@ CONFIG_LEDS_CLASS=y
CONFIG_LEDS_TRIGGERS=y
# CONFIG_LEDS_TRIGGER_TIMER is not set
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
+# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
# CONFIG_ACCESSIBILITY is not set
# CONFIG_INFINIBAND is not set
@@ -1793,6 +1987,7 @@ CONFIG_RTC_INTF_DEV=y
# CONFIG_RTC_DRV_M41T80 is not set
# CONFIG_RTC_DRV_S35390A is not set
# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
#
# SPI RTC drivers
@@ -1802,12 +1997,15 @@ CONFIG_RTC_INTF_DEV=y
# Platform RTC drivers
#
CONFIG_RTC_DRV_CMOS=y
+# CONFIG_RTC_DRV_DS1286 is not set
# CONFIG_RTC_DRV_DS1511 is not set
# CONFIG_RTC_DRV_DS1553 is not set
# CONFIG_RTC_DRV_DS1742 is not set
# CONFIG_RTC_DRV_STK17TA8 is not set
# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
# CONFIG_RTC_DRV_V3020 is not set
#
@@ -1820,6 +2018,21 @@ CONFIG_DMADEVICES=y
#
# CONFIG_INTEL_IOATDMA is not set
# CONFIG_UIO is not set
+# CONFIG_STAGING is not set
+CONFIG_X86_PLATFORM_DEVICES=y
+# CONFIG_ACER_WMI is not set
+# CONFIG_ASUS_LAPTOP is not set
+# CONFIG_FUJITSU_LAPTOP is not set
+# CONFIG_MSI_LAPTOP is not set
+# CONFIG_PANASONIC_LAPTOP is not set
+# CONFIG_COMPAL_LAPTOP is not set
+# CONFIG_SONY_LAPTOP is not set
+# CONFIG_THINKPAD_ACPI is not set
+# CONFIG_INTEL_MENLOW is not set
+CONFIG_EEEPC_LAPTOP=y
+# CONFIG_ACPI_WMI is not set
+# CONFIG_ACPI_ASUS is not set
+# CONFIG_ACPI_TOSHIBA is not set
#
# Firmware Drivers
@@ -1830,8 +2043,7 @@ CONFIG_EFI_VARS=y
# CONFIG_DELL_RBU is not set
# CONFIG_DCDBAS is not set
CONFIG_DMIID=y
-CONFIG_ISCSI_IBFT_FIND=y
-CONFIG_ISCSI_IBFT=y
+# CONFIG_ISCSI_IBFT_FIND is not set
#
# File systems
@@ -1841,22 +2053,25 @@ CONFIG_EXT3_FS=y
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
-# CONFIG_EXT4DEV_FS is not set
+# CONFIG_EXT4_FS is not set
CONFIG_JBD=y
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=y
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
CONFIG_FS_POSIX_ACL=y
+CONFIG_FILE_LOCKING=y
# CONFIG_XFS_FS is not set
# CONFIG_GFS2_FS is not set
# CONFIG_OCFS2_FS is not set
+# CONFIG_BTRFS_FS is not set
CONFIG_DNOTIFY=y
CONFIG_INOTIFY=y
CONFIG_INOTIFY_USER=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_QUOTA_TREE=y
# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
CONFIG_QUOTACTL=y
@@ -1890,16 +2105,14 @@ CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
CONFIG_PROC_VMCORE=y
CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
# CONFIG_CONFIGFS_FS is not set
-
-#
-# Miscellaneous filesystems
-#
+CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
# CONFIG_ECRYPT_FS is not set
@@ -1909,6 +2122,7 @@ CONFIG_HUGETLB_PAGE=y
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
# CONFIG_CRAMFS is not set
+# CONFIG_SQUASHFS is not set
# CONFIG_VXFS_FS is not set
# CONFIG_MINIX_FS is not set
# CONFIG_OMFS_FS is not set
@@ -1930,6 +2144,7 @@ CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
+# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=y
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
@@ -2006,7 +2221,7 @@ CONFIG_NLS_UTF8=y
#
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_PRINTK_TIME=y
-CONFIG_ENABLE_WARN_DEPRECATED=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_ENABLE_MUST_CHECK=y
CONFIG_FRAME_WARN=2048
CONFIG_MAGIC_SYSRQ=y
@@ -2035,40 +2250,60 @@ CONFIG_TIMER_STATS=y
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set
# CONFIG_DEBUG_VM is not set
+# CONFIG_DEBUG_VIRTUAL is not set
# CONFIG_DEBUG_WRITECOUNT is not set
CONFIG_DEBUG_MEMORY_INIT=y
# CONFIG_DEBUG_LIST is not set
# CONFIG_DEBUG_SG is not set
+# CONFIG_DEBUG_NOTIFIERS is not set
+CONFIG_ARCH_WANT_FRAME_POINTERS=y
CONFIG_FRAME_POINTER=y
# CONFIG_BOOT_PRINTK_DELAY is not set
# CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_KPROBES_SANITY_TEST is not set
# CONFIG_BACKTRACE_SELF_TEST is not set
+# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
-CONFIG_HAVE_FTRACE=y
+CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
-# CONFIG_FTRACE is not set
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_HAVE_HW_BRANCH_TRACER=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
# CONFIG_IRQSOFF_TRACER is not set
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_POWER_TRACER is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_HW_BRANCH_TRACER is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
# CONFIG_STRICT_DEVMEM is not set
CONFIG_X86_VERBOSE_BOOTUP=y
CONFIG_EARLY_PRINTK=y
+CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
# CONFIG_DEBUG_PAGEALLOC is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_X86_PTDUMP is not set
CONFIG_DEBUG_RODATA=y
-# CONFIG_DIRECT_GBPAGES is not set
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_NX_TEST=m
# CONFIG_IOMMU_DEBUG is not set
@@ -2092,8 +2327,10 @@ CONFIG_OPTIMIZE_INLINING=y
CONFIG_KEYS=y
CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
+# CONFIG_SECURITYFS is not set
CONFIG_SECURITY_NETWORK=y
# CONFIG_SECURITY_NETWORK_XFRM is not set
+# CONFIG_SECURITY_PATH is not set
CONFIG_SECURITY_FILE_CAPABILITIES=y
# CONFIG_SECURITY_ROOTPLUG is not set
CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2104,7 +2341,6 @@ CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_SECURITY_SELINUX_DEVELOP=y
CONFIG_SECURITY_SELINUX_AVC_STATS=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
-# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
# CONFIG_SECURITY_SMACK is not set
CONFIG_CRYPTO=y
@@ -2112,11 +2348,18 @@ CONFIG_CRYPTO=y
#
# Crypto core or helper
#
+# CONFIG_CRYPTO_FIPS is not set
CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
CONFIG_CRYPTO_AEAD=y
+CONFIG_CRYPTO_AEAD2=y
CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG2=y
CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_NULL is not set
# CONFIG_CRYPTO_CRYPTD is not set
@@ -2151,6 +2394,7 @@ CONFIG_CRYPTO_HMAC=y
# Digest
#
# CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_CRC32C_INTEL is not set
# CONFIG_CRYPTO_MD4 is not set
CONFIG_CRYPTO_MD5=y
# CONFIG_CRYPTO_MICHAEL_MIC is not set
@@ -2191,6 +2435,11 @@ CONFIG_CRYPTO_DES=y
#
# CONFIG_CRYPTO_DEFLATE is not set
# CONFIG_CRYPTO_LZO is not set
+
+#
+# Random Number Generation
+#
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_HAVE_KVM=y
@@ -2205,6 +2454,7 @@ CONFIG_VIRTUALIZATION=y
CONFIG_BITREVERSE=y
CONFIG_GENERIC_FIND_FIRST_BIT=y
CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_FIND_LAST_BIT=y
# CONFIG_CRC_CCITT is not set
# CONFIG_CRC16 is not set
CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
index 070afc5b6c9..b9d00261703 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -6,13 +6,22 @@
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2A: Instruction Set Reference, A-M
*
- * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
- * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ * Kent Liu <kent.liu@intel.com>
*
* This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <linux/init.h>
@@ -75,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len
* If your algorithm starts with ~0, then XOR with ~0 before you set
* the seed.
*/
-static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
+static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
unsigned int keylen)
{
- u32 *mctx = crypto_ahash_ctx(hash);
+ u32 *mctx = crypto_shash_ctx(hash);
if (keylen != sizeof(u32)) {
- crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
-static int crc32c_intel_init(struct ahash_request *req)
+static int crc32c_intel_init(struct shash_desc *desc)
{
- u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
- u32 *crcp = ahash_request_ctx(req);
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
*crcp = *mctx;
return 0;
}
-static int crc32c_intel_update(struct ahash_request *req)
+static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- struct crypto_hash_walk walk;
- u32 *crcp = ahash_request_ctx(req);
- u32 crc = *crcp;
- int nbytes;
-
- for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
- nbytes = crypto_hash_walk_done(&walk, 0))
- crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+ u32 *crcp = shash_desc_ctx(desc);
- *crcp = crc;
+ *crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
-static int crc32c_intel_final(struct ahash_request *req)
+static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
{
- u32 *crcp = ahash_request_ctx(req);
-
- *(__le32 *)req->result = ~cpu_to_le32p(crcp);
+ *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
-static int crc32c_intel_digest(struct ahash_request *req)
+static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- struct crypto_hash_walk walk;
- u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
- u32 crc = *mctx;
- int nbytes;
+ return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
- for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
- nbytes = crypto_hash_walk_done(&walk, 0))
- crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
- *(__le32 *)req->result = ~cpu_to_le32(crc);
+ *(__le32 *)out = ~cpu_to_le32p(crcp);
return 0;
}
+static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = ~0;
- tfm->crt_ahash.reqsize = sizeof(u32);
-
return 0;
}
-static struct crypto_alg alg = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-intel",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_AHASH,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_alignmask = 3,
- .cra_ctxsize = sizeof(u32),
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(alg.cra_list),
- .cra_init = crc32c_intel_cra_init,
- .cra_type = &crypto_ahash_type,
- .cra_u = {
- .ahash = {
- .digestsize = CHKSUM_DIGEST_SIZE,
- .setkey = crc32c_intel_setkey,
- .init = crc32c_intel_init,
- .update = crc32c_intel_update,
- .final = crc32c_intel_final,
- .digest = crc32c_intel_digest,
- }
+static struct shash_alg alg = {
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .finup = crc32c_intel_finup,
+ .digest = crc32c_intel_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32c_intel_cra_init,
}
};
@@ -175,14 +177,14 @@ static struct crypto_alg alg = {
static int __init crc32c_intel_mod_init(void)
{
if (cpu_has_xmm4_2)
- return crypto_register_alg(&alg);
+ return crypto_register_shash(&alg);
else
return -ENODEV;
}
static void __exit crc32c_intel_mod_fini(void)
{
- crypto_unregister_alg(&alg);
+ crypto_unregister_shash(&alg);
}
module_init(crc32c_intel_mod_init);
@@ -194,4 +196,3 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("crc32c");
MODULE_ALIAS("crc32c-intel");
-
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 127ec3f0721..2a4d073d2cf 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->cached_hole_size = 0;
current->mm->mmap = NULL;
- compute_creds(bprm);
+ install_exec_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
if (N_MAGIC(ex) == OMAGIC) {
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 4bc02b23674..dd77ac0cac4 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,13 +24,14 @@
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
-#include <asm/ia32.h>
#include <asm/ptrace.h>
#include <asm/ia32_unistd.h>
#include <asm/user32.h>
#include <asm/sigcontext32.h>
#include <asm/proto.h>
#include <asm/vdso.h>
+#include <asm/sigframe.h>
+#include <asm/sys_ia32.h>
#define DEBUG_SIG 0
@@ -41,83 +42,87 @@
X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
X86_EFLAGS_CF)
-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
{
- int err;
+ int err = 0;
if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
return -EFAULT;
- /* If you change siginfo_t structure, please make sure that
- this code is fixed accordingly.
- It should never copy any pad contained in the structure
- to avoid security leaks, but must copy the generic
- 3 ints plus the relevant union member. */
- err = __put_user(from->si_signo, &to->si_signo);
- err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
-
- if (from->si_code < 0) {
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
- } else {
- /*
- * First 32bits of unions are always present:
- * si_pid === si_band === si_tid === si_addr(LS half)
- */
- err |= __put_user(from->_sifields._pad[0],
- &to->_sifields._pad[0]);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
- break;
- case __SI_CHLD >> 16:
- err |= __put_user(from->si_utime, &to->si_utime);
- err |= __put_user(from->si_stime, &to->si_stime);
- err |= __put_user(from->si_status, &to->si_status);
- /* FALL THROUGH */
- default:
- case __SI_KILL >> 16:
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
- case __SI_POLL >> 16:
- err |= __put_user(from->si_fd, &to->si_fd);
- break;
- case __SI_TIMER >> 16:
- err |= __put_user(from->si_overrun, &to->si_overrun);
- err |= __put_user(ptr_to_compat(from->si_ptr),
- &to->si_ptr);
- break;
- /* This is not generated by the kernel as of now. */
- case __SI_RT >> 16:
- case __SI_MESGQ >> 16:
- err |= __put_user(from->si_uid, &to->si_uid);
- err |= __put_user(from->si_int, &to->si_int);
- break;
+ put_user_try {
+ /* If you change siginfo_t structure, please make sure that
+ this code is fixed accordingly.
+ It should never copy any pad contained in the structure
+ to avoid security leaks, but must copy the generic
+ 3 ints plus the relevant union member. */
+ put_user_ex(from->si_signo, &to->si_signo);
+ put_user_ex(from->si_errno, &to->si_errno);
+ put_user_ex((short)from->si_code, &to->si_code);
+
+ if (from->si_code < 0) {
+ put_user_ex(from->si_pid, &to->si_pid);
+ put_user_ex(from->si_uid, &to->si_uid);
+ put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+ } else {
+ /*
+ * First 32bits of unions are always present:
+ * si_pid === si_band === si_tid === si_addr(LS half)
+ */
+ put_user_ex(from->_sifields._pad[0],
+ &to->_sifields._pad[0]);
+ switch (from->si_code >> 16) {
+ case __SI_FAULT >> 16:
+ break;
+ case __SI_CHLD >> 16:
+ put_user_ex(from->si_utime, &to->si_utime);
+ put_user_ex(from->si_stime, &to->si_stime);
+ put_user_ex(from->si_status, &to->si_status);
+ /* FALL THROUGH */
+ default:
+ case __SI_KILL >> 16:
+ put_user_ex(from->si_uid, &to->si_uid);
+ break;
+ case __SI_POLL >> 16:
+ put_user_ex(from->si_fd, &to->si_fd);
+ break;
+ case __SI_TIMER >> 16:
+ put_user_ex(from->si_overrun, &to->si_overrun);
+ put_user_ex(ptr_to_compat(from->si_ptr),
+ &to->si_ptr);
+ break;
+ /* This is not generated by the kernel as of now. */
+ case __SI_RT >> 16:
+ case __SI_MESGQ >> 16:
+ put_user_ex(from->si_uid, &to->si_uid);
+ put_user_ex(from->si_int, &to->si_int);
+ break;
+ }
}
- }
+ } put_user_catch(err);
+
return err;
}
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
- int err;
+ int err = 0;
u32 ptr32;
if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
return -EFAULT;
- err = __get_user(to->si_signo, &from->si_signo);
- err |= __get_user(to->si_errno, &from->si_errno);
- err |= __get_user(to->si_code, &from->si_code);
+ get_user_try {
+ get_user_ex(to->si_signo, &from->si_signo);
+ get_user_ex(to->si_errno, &from->si_errno);
+ get_user_ex(to->si_code, &from->si_code);
- err |= __get_user(to->si_pid, &from->si_pid);
- err |= __get_user(to->si_uid, &from->si_uid);
- err |= __get_user(ptr32, &from->si_ptr);
- to->si_ptr = compat_ptr(ptr32);
+ get_user_ex(to->si_pid, &from->si_pid);
+ get_user_ex(to->si_uid, &from->si_uid);
+ get_user_ex(ptr32, &from->si_ptr);
+ to->si_ptr = compat_ptr(ptr32);
+ } get_user_catch(err);
return err;
}
@@ -142,17 +147,23 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
struct pt_regs *regs)
{
stack_t uss, uoss;
- int ret;
+ int ret, err = 0;
mm_segment_t seg;
if (uss_ptr) {
u32 ptr;
memset(&uss, 0, sizeof(stack_t));
- if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) ||
- __get_user(ptr, &uss_ptr->ss_sp) ||
- __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
- __get_user(uss.ss_size, &uss_ptr->ss_size))
+ if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)))
+ return -EFAULT;
+
+ get_user_try {
+ get_user_ex(ptr, &uss_ptr->ss_sp);
+ get_user_ex(uss.ss_flags, &uss_ptr->ss_flags);
+ get_user_ex(uss.ss_size, &uss_ptr->ss_size);
+ } get_user_catch(err);
+
+ if (err)
return -EFAULT;
uss.ss_sp = compat_ptr(ptr);
}
@@ -161,10 +172,16 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp);
set_fs(seg);
if (ret >= 0 && uoss_ptr) {
- if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) ||
- __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
- __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
- __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)))
+ return -EFAULT;
+
+ put_user_try {
+ put_user_ex(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp);
+ put_user_ex(uoss.ss_flags, &uoss_ptr->ss_flags);
+ put_user_ex(uoss.ss_size, &uoss_ptr->ss_size);
+ } put_user_catch(err);
+
+ if (err)
ret = -EFAULT;
}
return ret;
@@ -173,47 +190,28 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
/*
* Do a signal return; undo the signal stack.
*/
+#define COPY(x) { \
+ get_user_ex(regs->x, &sc->x); \
+}
-struct sigframe
-{
- u32 pretcode;
- int sig;
- struct sigcontext_ia32 sc;
- struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
- unsigned int extramask[_COMPAT_NSIG_WORDS-1];
- char retcode[8];
- /* fp state follows here */
-};
-
-struct rt_sigframe
-{
- u32 pretcode;
- int sig;
- u32 pinfo;
- u32 puc;
- compat_siginfo_t info;
- struct ucontext_ia32 uc;
- char retcode[8];
- /* fp state follows here */
-};
-
-#define COPY(x) { \
- unsigned int reg; \
- err |= __get_user(reg, &sc->x); \
- regs->x = reg; \
+#define COPY_SEG_CPL3(seg) { \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ regs->seg = tmp | 3; \
}
-#define RELOAD_SEG(seg,mask) \
- { unsigned int cur; \
- unsigned short pre; \
- err |= __get_user(pre, &sc->seg); \
- savesegment(seg, cur); \
- pre |= mask; \
- if (pre != cur) loadsegment(seg, pre); }
+#define RELOAD_SEG(seg) { \
+ unsigned int cur, pre; \
+ get_user_ex(pre, &sc->seg); \
+ savesegment(seg, cur); \
+ pre |= 3; \
+ if (pre != cur) \
+ loadsegment(seg, pre); \
+}
static int ia32_restore_sigcontext(struct pt_regs *regs,
struct sigcontext_ia32 __user *sc,
- unsigned int *peax)
+ unsigned int *pax)
{
unsigned int tmpflags, gs, oldgs, err = 0;
void __user *buf;
@@ -228,49 +226,48 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
sc, sc->err, sc->ip, sc->cs, sc->flags);
#endif
- /*
- * Reload fs and gs if they have changed in the signal
- * handler. This does not handle long fs/gs base changes in
- * the handler, but does not clobber them at least in the
- * normal case.
- */
- err |= __get_user(gs, &sc->gs);
- gs |= 3;
- savesegment(gs, oldgs);
- if (gs != oldgs)
- load_gs_index(gs);
-
- RELOAD_SEG(fs, 3);
- RELOAD_SEG(ds, 3);
- RELOAD_SEG(es, 3);
-
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
- /* Don't touch extended registers */
-
- err |= __get_user(regs->cs, &sc->cs);
- regs->cs |= 3;
- err |= __get_user(regs->ss, &sc->ss);
- regs->ss |= 3;
-
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- /* disable syscall checks */
- regs->orig_ax = -1;
-
- err |= __get_user(tmp, &sc->fpstate);
- buf = compat_ptr(tmp);
- err |= restore_i387_xstate_ia32(buf);
-
- err |= __get_user(tmp, &sc->ax);
- *peax = tmp;
+ get_user_try {
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
+ get_user_ex(gs, &sc->gs);
+ gs |= 3;
+ savesegment(gs, oldgs);
+ if (gs != oldgs)
+ load_gs_index(gs);
+
+ RELOAD_SEG(fs);
+ RELOAD_SEG(ds);
+ RELOAD_SEG(es);
+
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip);
+ /* Don't touch extended registers */
+
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
+
+ get_user_ex(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+ get_user_ex(tmp, &sc->fpstate);
+ buf = compat_ptr(tmp);
+ err |= restore_i387_xstate_ia32(buf);
+
+ get_user_ex(*pax, &sc->ax);
+ } get_user_catch(err);
return err;
}
asmlinkage long sys32_sigreturn(struct pt_regs *regs)
{
- struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8);
+ struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
sigset_t set;
unsigned int ax;
@@ -300,12 +297,12 @@ badframe:
asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
{
- struct rt_sigframe __user *frame;
+ struct rt_sigframe_ia32 __user *frame;
sigset_t set;
unsigned int ax;
struct pt_regs tregs;
- frame = (struct rt_sigframe __user *)(regs->sp - 4);
+ frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -342,41 +339,38 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
{
int tmp, err = 0;
- savesegment(gs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
- savesegment(fs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
- savesegment(ds, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
- savesegment(es, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->es);
-
- err |= __put_user(regs->di, &sc->di);
- err |= __put_user(regs->si, &sc->si);
- err |= __put_user(regs->bp, &sc->bp);
- err |= __put_user(regs->sp, &sc->sp);
- err |= __put_user(regs->bx, &sc->bx);
- err |= __put_user(regs->dx, &sc->dx);
- err |= __put_user(regs->cx, &sc->cx);
- err |= __put_user(regs->ax, &sc->ax);
- err |= __put_user(regs->cs, &sc->cs);
- err |= __put_user(regs->ss, &sc->ss);
- err |= __put_user(current->thread.trap_no, &sc->trapno);
- err |= __put_user(current->thread.error_code, &sc->err);
- err |= __put_user(regs->ip, &sc->ip);
- err |= __put_user(regs->flags, &sc->flags);
- err |= __put_user(regs->sp, &sc->sp_at_signal);
-
- tmp = save_i387_xstate_ia32(fpstate);
- if (tmp < 0)
- err = -EFAULT;
- else
- err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
- &sc->fpstate);
-
- /* non-iBCS2 extensions.. */
- err |= __put_user(mask, &sc->oldmask);
- err |= __put_user(current->thread.cr2, &sc->cr2);
+ put_user_try {
+ savesegment(gs, tmp);
+ put_user_ex(tmp, (unsigned int __user *)&sc->gs);
+ savesegment(fs, tmp);
+ put_user_ex(tmp, (unsigned int __user *)&sc->fs);
+ savesegment(ds, tmp);
+ put_user_ex(tmp, (unsigned int __user *)&sc->ds);
+ savesegment(es, tmp);
+ put_user_ex(tmp, (unsigned int __user *)&sc->es);
+
+ put_user_ex(regs->di, &sc->di);
+ put_user_ex(regs->si, &sc->si);
+ put_user_ex(regs->bp, &sc->bp);
+ put_user_ex(regs->sp, &sc->sp);
+ put_user_ex(regs->bx, &sc->bx);
+ put_user_ex(regs->dx, &sc->dx);
+ put_user_ex(regs->cx, &sc->cx);
+ put_user_ex(regs->ax, &sc->ax);
+ put_user_ex(current->thread.trap_no, &sc->trapno);
+ put_user_ex(current->thread.error_code, &sc->err);
+ put_user_ex(regs->ip, &sc->ip);
+ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->sp, &sc->sp_at_signal);
+ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
+
+ put_user_ex(ptr_to_compat(fpstate), &sc->fpstate);
+
+ /* non-iBCS2 extensions.. */
+ put_user_ex(mask, &sc->oldmask);
+ put_user_ex(current->thread.cr2, &sc->cr2);
+ } put_user_catch(err);
return err;
}
@@ -400,7 +394,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
}
/* This is the legacy signal stack switching. */
- else if ((regs->ss & 0xffff) != __USER_DS &&
+ else if ((regs->ss & 0xffff) != __USER32_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer)
sp = (unsigned long) ka->sa.sa_restorer;
@@ -408,6 +402,8 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
if (used_math()) {
sp = sp - sig_xstate_ia32_size;
*fpstate = (struct _fpstate_ia32 *) sp;
+ if (save_i387_xstate_ia32(*fpstate) < 0)
+ return (void __user *) -1L;
}
sp -= frame_size;
@@ -420,7 +416,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
int ia32_setup_frame(int sig, struct k_sigaction *ka,
compat_sigset_t *set, struct pt_regs *regs)
{
- struct sigframe __user *frame;
+ struct sigframe_ia32 __user *frame;
void __user *restorer;
int err = 0;
void __user *fpstate = NULL;
@@ -430,12 +426,10 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
u16 poplmovl;
u32 val;
u16 int80;
- u16 pad;
} __attribute__((packed)) code = {
0xb858, /* popl %eax ; movl $...,%eax */
__NR_ia32_sigreturn,
0x80cd, /* int $0x80 */
- 0,
};
frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
@@ -465,13 +459,17 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
else
restorer = &frame->retcode;
}
- err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
- /*
- * These are actually not used anymore, but left because some
- * gdb versions depend on them as a marker.
- */
- err |= __copy_to_user(frame->retcode, &code, 8);
+ put_user_try {
+ put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
+
+ /*
+ * These are actually not used anymore, but left because some
+ * gdb versions depend on them as a marker.
+ */
+ put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
+ } put_user_catch(err);
+
if (err)
return -EFAULT;
@@ -501,7 +499,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
compat_sigset_t *set, struct pt_regs *regs)
{
- struct rt_sigframe __user *frame;
+ struct rt_sigframe_ia32 __user *frame;
void __user *restorer;
int err = 0;
void __user *fpstate = NULL;
@@ -511,8 +509,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
u8 movl;
u32 val;
u16 int80;
- u16 pad;
- u8 pad2;
+ u8 pad;
} __attribute__((packed)) code = {
0xb8,
__NR_ia32_rt_sigreturn,
@@ -525,41 +522,40 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
- err |= __put_user(sig, &frame->sig);
- err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
- err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
- err |= copy_siginfo_to_user32(&frame->info, info);
- if (err)
- return -EFAULT;
+ put_user_try {
+ put_user_ex(sig, &frame->sig);
+ put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo);
+ put_user_ex(ptr_to_compat(&frame->uc), &frame->puc);
+ err |= copy_siginfo_to_user32(&frame->info, info);
- /* Create the ucontext. */
- if (cpu_has_xsave)
- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
- if (err)
- return -EFAULT;
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+ else
+ restorer = VDSO32_SYMBOL(current->mm->context.vdso,
+ rt_sigreturn);
+ put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
+
+ /*
+ * Not actually used anymore, but left because some gdb
+ * versions need it.
+ */
+ put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
+ } put_user_catch(err);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
- else
- restorer = VDSO32_SYMBOL(current->mm->context.vdso,
- rt_sigreturn);
- err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
-
- /*
- * Not actually used anymore, but left because some gdb
- * versions need it.
- */
- err |= __copy_to_user(frame->retcode, &code, 8);
if (err)
return -EFAULT;
@@ -572,11 +568,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- /* Make -mregparm=3 work */
- regs->ax = sig;
- regs->dx = (unsigned long) &frame->info;
- regs->cx = (unsigned long) &frame->uc;
-
loadsegment(ds, __USER32_DS);
loadsegment(es, __USER32_DS);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b6189..097a6b64c24 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target)
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
- movq %gs:pda_kernelstack, %rsp
- addq $(PDA_STACKOFFSET),%rsp
+ movq PER_CPU_VAR(kernel_stack), %rsp
+ addq $(KERNEL_STACK_OFFSET),%rsp
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs, here we enable it straight after entry:
@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target)
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,PDA_STACKOFFSET
+ CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
movl %esp,%r8d
CFI_REGISTER rsp,r8
- movq %gs:pda_kernelstack,%rsp
+ movq PER_CPU_VAR(kernel_stack),%rsp
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
@@ -418,9 +418,9 @@ ENTRY(ia32_syscall)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
-ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
- ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
+ ja ia32_badsys
+ia32_do_call:
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
@@ -435,7 +435,9 @@ ia32_tracesys:
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
- jmp ia32_do_syscall
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
+ jmp ia32_do_call
END(ia32_syscall)
ia32_badsys:
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index d21991ce606..29cdcd02ead 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -8,6 +8,7 @@
#include <linux/shm.h>
#include <linux/ipc.h>
#include <linux/compat.h>
+#include <asm/sys_ia32.h>
asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
compat_uptr_t ptr, u32 fifth)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 2e09dcd3c0a..6c0d7f6231a 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -44,8 +44,8 @@
#include <asm/types.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>
-#include <asm/ia32.h>
#include <asm/vgtod.h>
+#include <asm/sys_ia32.h>
#define AA(__x) ((unsigned long)(__x))
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index 37822206083..bb70e397aa8 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -23,8 +23,6 @@
*/
static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
{
- u16 gs;
-
/* changed the size calculations - should hopefully work better. lbt */
dump->magic = CMAGIC;
dump->start_code = 0;
@@ -57,7 +55,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
dump->regs.ds = (u16)regs->ds;
dump->regs.es = (u16)regs->es;
dump->regs.fs = (u16)regs->fs;
- savesegment(gs, gs);
+ dump->regs.gs = get_user_gs(regs);
dump->regs.orig_ax = regs->orig_ax;
dump->regs.ip = regs->ip;
dump->regs.cs = (u16)regs->cs;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 8d676d8ecde..4518dc50090 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -102,9 +102,6 @@ static inline void disable_acpi(void)
acpi_noirq = 1;
}
-/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
-#define FIX_ACPI_PAGES 4
-
extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
@@ -113,7 +110,6 @@ static inline void acpi_disable_pci(void)
acpi_pci_disabled = 1;
acpi_noirq_set();
}
-extern int acpi_irq_balance_set(char *str);
/* routines for saving/restoring kernel state */
extern int acpi_save_state_mem(void);
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 1a30c0440c6..95c8cd9d22b 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -190,16 +190,23 @@
/* FIXME: move this macro to <linux/pci.h> */
#define PCI_BUS(x) (((x) >> 8) & 0xff)
+/* Protection domain flags */
+#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
+#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
+ domain for an IOMMU */
+
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
*/
struct protection_domain {
- spinlock_t lock; /* mostly used to lock the page table*/
- u16 id; /* the domain id written to the device table */
- int mode; /* paging mode (0-6 levels) */
- u64 *pt_root; /* page table root pointer */
- void *priv; /* private data */
+ spinlock_t lock; /* mostly used to lock the page table*/
+ u16 id; /* the domain id written to the device table */
+ int mode; /* paging mode (0-6 levels) */
+ u64 *pt_root; /* page table root pointer */
+ unsigned long flags; /* flags to find out type of domain */
+ unsigned dev_cnt; /* devices assigned to this domain */
+ void *priv; /* private data */
};
/*
@@ -251,13 +258,6 @@ struct amd_iommu {
/* Pointer to PCI device of this IOMMU */
struct pci_dev *dev;
- /*
- * Capability pointer. There could be more than one IOMMU per PCI
- * device function if there are more than one AMD IOMMU capability
- * pointers.
- */
- u16 cap_ptr;
-
/* physical address of MMIO space */
u64 mmio_phys;
/* virtual address of MMIO space */
@@ -266,6 +266,13 @@ struct amd_iommu {
/* capabilities of that IOMMU read from ACPI */
u32 cap;
+ /*
+ * Capability pointer. There could be more than one IOMMU per PCI
+ * device function if there are more than one AMD IOMMU capability
+ * pointers.
+ */
+ u16 cap_ptr;
+
/* pci domain of this IOMMU */
u16 pci_seg;
@@ -284,19 +291,19 @@ struct amd_iommu {
/* size of command buffer */
u32 cmd_buf_size;
- /* event buffer virtual address */
- u8 *evt_buf;
/* size of event buffer */
u32 evt_buf_size;
+ /* event buffer virtual address */
+ u8 *evt_buf;
/* MSI number for event interrupt */
u16 evt_msi_num;
- /* if one, we need to send a completion wait command */
- int need_sync;
-
/* true if interrupts for this IOMMU are already enabled */
bool int_enabled;
+ /* if one, we need to send a completion wait command */
+ bool need_sync;
+
/* default dma_ops domain for that IOMMU */
struct dma_ops_domain *default_dom;
};
@@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table;
extern unsigned long *amd_iommu_pd_alloc_bitmap;
/* will be 1 if device isolation is enabled */
-extern int amd_iommu_isolate;
+extern bool amd_iommu_isolate;
/*
* If true, the addresses will be flushed on unmap time, not when
@@ -382,18 +389,6 @@ extern int amd_iommu_isolate;
*/
extern bool amd_iommu_unmap_flush;
-/* takes a PCI device id and prints it out in a readable form */
-static inline void print_devid(u16 devid, int nl)
-{
- int bus = devid >> 8;
- int dev = devid >> 3 & 0x1f;
- int fn = devid & 0x07;
-
- printk("%02x:%02x.%x", bus, dev, fn);
- if (nl)
- printk("\n");
-}
-
/* takes bus and device/function and returns the device id
* FIXME: should that be in generic PCI code? */
static inline u16 calc_devid(u8 bus, u8 devfn)
@@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn)
return (((u16)bus) << 8) | devfn;
}
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+struct __iommu_counter {
+ char *name;
+ struct dentry *dent;
+ u64 value;
+};
+
+#define DECLARE_STATS_COUNTER(nm) \
+ static struct __iommu_counter nm = { \
+ .name = #nm, \
+ }
+
+#define INC_STATS_COUNTER(name) name.value += 1
+#define ADD_STATS_COUNTER(name, x) name.value += (x)
+#define SUB_STATS_COUNTER(name, x) name.value -= (x)
+
+#else /* CONFIG_AMD_IOMMU_STATS */
+
+#define DECLARE_STATS_COUNTER(name)
+#define INC_STATS_COUNTER(name)
+#define ADD_STATS_COUNTER(name, x)
+#define SUB_STATS_COUNTER(name, x)
+
+static inline void amd_iommu_stats_init(void) { }
+
+#endif /* CONFIG_AMD_IOMMU_STATS */
+
#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3b1510b4fc5..fba49f66228 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -33,7 +33,13 @@
} while (0)
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
extern void generic_apic_probe(void);
+#else
+static inline void generic_apic_probe(void)
+{
+}
+#endif
#ifdef CONFIG_X86_LOCAL_APIC
@@ -41,6 +47,21 @@ extern unsigned int apic_verbosity;
extern int local_apic_timer_c2_ok;
extern int disable_apic;
+
+#ifdef CONFIG_SMP
+extern void __inquire_remote_apic(int apicid);
+#else /* CONFIG_SMP */
+static inline void __inquire_remote_apic(int apicid)
+{
+}
+#endif /* CONFIG_SMP */
+
+static inline void default_inquire_remote_apic(int apicid)
+{
+ if (apic_verbosity >= APIC_DEBUG)
+ __inquire_remote_apic(apicid);
+}
+
/*
* Basic functions accessing APICs.
*/
@@ -54,7 +75,6 @@ extern int disable_apic;
extern int is_vsmp_box(void);
extern void xapic_wait_icr_idle(void);
extern u32 safe_xapic_wait_icr_idle(void);
-extern u64 xapic_icr_read(void);
extern void xapic_icr_write(u32, u32);
extern int setup_profiling_timer(unsigned int);
@@ -93,7 +113,7 @@ static inline u32 native_apic_msr_read(u32 reg)
}
#ifndef CONFIG_X86_32
-extern int x2apic, x2apic_preenabled;
+extern int x2apic;
extern void check_x2apic(void);
extern void enable_x2apic(void);
extern void enable_IR_x2apic(void);
@@ -125,12 +145,35 @@ struct apic_ops {
extern struct apic_ops *apic_ops;
-#define apic_read (apic_ops->read)
-#define apic_write (apic_ops->write)
-#define apic_icr_read (apic_ops->icr_read)
-#define apic_icr_write (apic_ops->icr_write)
-#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
-#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
+static inline u32 apic_read(u32 reg)
+{
+ return apic_ops->read(reg);
+}
+
+static inline void apic_write(u32 reg, u32 val)
+{
+ apic_ops->write(reg, val);
+}
+
+static inline u64 apic_icr_read(void)
+{
+ return apic_ops->icr_read();
+}
+
+static inline void apic_icr_write(u32 low, u32 high)
+{
+ apic_ops->icr_write(low, high);
+}
+
+static inline void apic_wait_icr_idle(void)
+{
+ apic_ops->wait_icr_idle();
+}
+
+static inline u32 safe_apic_wait_icr_idle(void)
+{
+ return apic_ops->safe_wait_icr_idle();
+}
extern int get_physical_broadcast(void);
@@ -193,7 +236,26 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
static inline void lapic_shutdown(void) { }
#define local_apic_timer_c2_ok 1
static inline void init_apic_mappings(void) { }
+static inline void disable_local_APIC(void) { }
#endif /* !CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_X86_64
+#define SET_APIC_ID(x) (apic->set_apic_id(x))
+#else
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static inline unsigned default_get_apic_id(unsigned long x)
+{
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+ if (APIC_XAPIC(ver))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
+}
+#endif
+
+#endif
+
#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 00000000000..82f613c607c
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_APICNUM_H
+#define _ASM_X86_APICNUM_H
+
+/* define MAX_IO_APICS */
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/apm.h
index 20370c6db74..20370c6db74 100644
--- a/arch/x86/include/asm/mach-default/apm.h
+++ b/arch/x86/include/asm/apm.h
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index ad5b9f6ecdd..85b46fba422 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -2,6 +2,7 @@
#define _ASM_X86_ATOMIC_32_H
#include <linux/compiler.h>
+#include <linux/types.h>
#include <asm/processor.h>
#include <asm/cmpxchg.h>
@@ -10,15 +11,6 @@
* resource counting etc..
*/
-/*
- * Make sure gcc doesn't try to be clever and move things around
- * on us. We need to use _exactly_ the address the user gave us,
- * not some alias that contains the same information.
- */
-typedef struct {
- int counter;
-} atomic_t;
-
#define ATOMIC_INIT(i) { (i) }
/**
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 279d2a731f3..8c21731984d 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -1,25 +1,15 @@
#ifndef _ASM_X86_ATOMIC_64_H
#define _ASM_X86_ATOMIC_64_H
+#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
-/* atomic_t should be 32 bit signed type */
-
/*
* Atomic operations that C can't guarantee us. Useful for
* resource counting etc..
*/
-/*
- * Make sure gcc doesn't try to be clever and move things around
- * on us. We need to use _exactly_ the address the user gave us,
- * not some alias that contains the same information.
- */
-typedef struct {
- int counter;
-} atomic_t;
-
#define ATOMIC_INIT(i) { (i) }
/**
@@ -191,11 +181,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
#define atomic_inc_return(v) (atomic_add_return(1, v))
#define atomic_dec_return(v) (atomic_sub_return(1, v))
-/* An 64bit atomic type */
-
-typedef struct {
- long counter;
-} atomic64_t;
+/* The 64-bit atomic type */
#define ATOMIC64_INIT(i) { (i) }
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
deleted file mode 100644
index 1d9543b9d35..00000000000
--- a/arch/x86/include/asm/bigsmp/apic.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef __ASM_MACH_APIC_H
-#define __ASM_MACH_APIC_H
-
-#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
-#define esr_disable (1)
-
-static inline int apic_id_registered(void)
-{
- return (1);
-}
-
-static inline cpumask_t target_cpus(void)
-{
-#ifdef CONFIG_SMP
- return cpu_online_map;
-#else
- return cpumask_of_cpu(0);
-#endif
-}
-
-#undef APIC_DEST_LOGICAL
-#define APIC_DEST_LOGICAL 0
-#define APIC_DFR_VALUE (APIC_DFR_FLAT)
-#define INT_DELIVERY_MODE (dest_Fixed)
-#define INT_DEST_MODE (0) /* phys delivery to target proc */
-#define NO_BALANCE_IRQ (0)
-#define WAKE_SECONDARY_VIA_INIT
-
-
-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return (0);
-}
-
-static inline unsigned long check_apicid_present(int bit)
-{
- return (1);
-}
-
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long val, id;
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- id = xapic_phys_to_log_apicid(cpu);
- val |= SET_APIC_LOGICAL_ID(id);
- return val;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static inline void init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static inline void setup_apic_routing(void)
-{
- printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
- "Physflat", nr_ioapics);
-}
-
-static inline int multi_timer_check(int apic, int irq)
-{
- return (0);
-}
-
-static inline int apicid_to_node(int logical_apicid)
-{
- return apicid_2_node[hard_smp_processor_id()];
-}
-
-static inline int cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < NR_CPUS)
- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
-
- return BAD_APICID;
-}
-
-static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
-extern u8 cpu_2_logical_apicid[];
-/* Mapping from cpu number to logical apicid */
-static inline int cpu_to_logical_apicid(int cpu)
-{
- if (cpu >= NR_CPUS)
- return BAD_APICID;
- return cpu_physical_id(cpu);
-}
-
-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xFFL);
-}
-
-static inline void setup_portio_remap(void)
-{
-}
-
-static inline void enable_apic_mode(void)
-{
-}
-
-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return (1);
-}
-
-/* As we are using single CPU as destination, pick only one CPU here */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
- int cpu;
- int apicid;
-
- cpu = first_cpu(cpumask);
- apicid = cpu_to_logical_apicid(cpu);
- return apicid;
-}
-
-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-#endif /* __ASM_MACH_APIC_H */
diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h
deleted file mode 100644
index 392c3f5ef2f..00000000000
--- a/arch/x86/include/asm/bigsmp/apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __ASM_MACH_APICDEF_H
-#define __ASM_MACH_APICDEF_H
-
-#define APIC_ID_MASK (0xFF<<24)
-
-static inline unsigned get_apic_id(unsigned long x)
-{
- return (((x)>>24)&0xFF);
-}
-
-#define GET_APIC_ID(x) get_apic_id(x)
-
-#endif
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
deleted file mode 100644
index 9404c535b7e..00000000000
--- a/arch/x86/include/asm/bigsmp/ipi.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __ASM_MACH_IPI_H
-#define __ASM_MACH_IPI_H
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
-
-static inline void send_IPI_mask(cpumask_t mask, int vector)
-{
- send_IPI_mask_sequence(mask, vector);
-}
-
-static inline void send_IPI_allbutself(int vector)
-{
- cpumask_t mask = cpu_online_map;
- cpu_clear(smp_processor_id(), mask);
-
- if (!cpus_empty(mask))
- send_IPI_mask(mask, vector);
-}
-
-static inline void send_IPI_all(int vector)
-{
- send_IPI_mask(cpu_online_map, vector);
-}
-
-#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 36001032271..02b47a603fc 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -3,6 +3,9 @@
/*
* Copyright 1992, Linus Torvalds.
+ *
+ * Note: inlines with more than a single statement should be marked
+ * __always_inline to avoid problems with older gcc's inlining heuristics.
*/
#ifndef _LINUX_BITOPS_H
@@ -53,7 +56,8 @@
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
-static inline void set_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline void
+set_bit(unsigned int nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "orb %1,%0"
@@ -90,7 +94,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
* you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
* in order to ensure changes are visible on other processors.
*/
-static inline void clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+clear_bit(int nr, volatile unsigned long *addr)
{
if (IS_IMMEDIATE(nr)) {
asm volatile(LOCK_PREFIX "andb %1,%0"
@@ -168,7 +173,15 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
*/
static inline void change_bit(int nr, volatile unsigned long *addr)
{
- asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr));
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "xorb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)CONST_MASK(nr)));
+ } else {
+ asm volatile(LOCK_PREFIX "btc %1,%0"
+ : BITOP_ADDR(addr)
+ : "Ir" (nr));
+ }
}
/**
@@ -196,7 +209,8 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
*
* This is the same as test_and_set_bit on x86.
*/
-static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
+static __always_inline int
+test_and_set_bit_lock(int nr, volatile unsigned long *addr)
{
return test_and_set_bit(nr, addr);
}
@@ -292,7 +306,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
return oldbit;
}
-static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr)
{
return ((1UL << (nr % BITS_PER_LONG)) &
(((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 3def2065fce..d9cf1cd156d 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -9,7 +9,7 @@
#ifdef CONFIG_X86_32
# define __BUG_C0 "2:\t.long 1b, %c0\n"
#else
-# define __BUG_C0 "2:\t.quad 1b, %c0\n"
+# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
#endif
#define BUG() \
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
index e02ae2d89ac..b13a7a88f3e 100644
--- a/arch/x86/include/asm/byteorder.h
+++ b/arch/x86/include/asm/byteorder.h
@@ -1,81 +1,6 @@
#ifndef _ASM_X86_BYTEORDER_H
#define _ASM_X86_BYTEORDER_H
-#include <asm/types.h>
-#include <linux/compiler.h>
-
-#ifdef __GNUC__
-
-#ifdef __i386__
-
-static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
-{
-#ifdef CONFIG_X86_BSWAP
- asm("bswap %0" : "=r" (x) : "0" (x));
-#else
- asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
- "rorl $16,%0\n\t" /* swap words */
- "xchgb %b0,%h0" /* swap higher bytes */
- : "=q" (x)
- : "0" (x));
-#endif
- return x;
-}
-
-static inline __attribute_const__ __u64 ___arch__swab64(__u64 val)
-{
- union {
- struct {
- __u32 a;
- __u32 b;
- } s;
- __u64 u;
- } v;
- v.u = val;
-#ifdef CONFIG_X86_BSWAP
- asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
- : "=r" (v.s.a), "=r" (v.s.b)
- : "0" (v.s.a), "1" (v.s.b));
-#else
- v.s.a = ___arch__swab32(v.s.a);
- v.s.b = ___arch__swab32(v.s.b);
- asm("xchgl %0,%1"
- : "=r" (v.s.a), "=r" (v.s.b)
- : "0" (v.s.a), "1" (v.s.b));
-#endif
- return v.u;
-}
-
-#else /* __i386__ */
-
-static inline __attribute_const__ __u64 ___arch__swab64(__u64 x)
-{
- asm("bswapq %0"
- : "=r" (x)
- : "0" (x));
- return x;
-}
-
-static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
-{
- asm("bswapl %0"
- : "=r" (x)
- : "0" (x));
- return x;
-}
-
-#endif
-
-/* Do not define swab16. Gcc is smart enough to recognize "C" version and
- convert it into rotation or exhange. */
-
-#define __arch__swab64(x) ___arch__swab64(x)
-#define __arch__swab32(x) ___arch__swab32(x)
-
-#define __BYTEORDER_HAS_U64__
-
-#endif /* __GNUC__ */
-
#include <linux/byteorder/little_endian.h>
#endif /* _ASM_X86_BYTEORDER_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 2bc162e0ec6..0e63c9a2a8d 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -1,5 +1,55 @@
/*
- * Some macros to handle stack frames in assembly.
+
+ x86 function call convention, 64-bit:
+ -------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ ---------------------------------------------------------------------------
+ rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
+
+ ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
+ functions when it sees tail-call optimization possibilities) rflags is
+ clobbered. Leftover arguments are passed over the stack frame.)
+
+ [*] In the frame-pointers case rbp is fixed to the stack frame.
+
+ [**] for struct return values wider than 64 bits the return convention is a
+ bit more complex: up to 128 bits width we return small structures
+ straight in rax, rdx. For structures larger than that (3 words or
+ larger) the caller puts a pointer to an on-stack return struct
+ [allocated in the caller's stack frame] into the first argument - i.e.
+ into rdi. All other arguments shift up by one in this case.
+ Fortunately this case is rare in the kernel.
+
+For 32-bit we have the following conventions - kernel is built with
+-mregparm=3 and -freg-struct-return:
+
+ x86 function calling convention, 32-bit:
+ ----------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ -------------------------------------------------------------------------
+ eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
+
+ ( here too esp is obviously invariant across normal function calls. eflags
+ is clobbered. Leftover arguments are passed over the stack frame. )
+
+ [*] In the frame-pointers case ebp is fixed to the stack frame.
+
+ [**] We build with -freg-struct-return, which on 32-bit means similar
+ semantics as on 64-bit: edx can be used for a second return value
+ (i.e. covering integer and structure sizes up to 64 bits) - after that
+ it gets more complex and more expensive: 3-word or larger struct returns
+ get done in the caller's frame and the pointer to the return struct goes
+ into regparm0, i.e. eax - the other arguments shift up and the
+ function's register parameters degenerate to regparm=2 in essence.
+
+*/
+
+
+/*
+ * 64-bit system call stack frame layout defines and helpers,
+ * for assembly code:
*/
#define R15 0
@@ -9,7 +59,7 @@
#define RBP 32
#define RBX 40
-/* arguments: interrupts/non tracing syscalls only save upto here*/
+/* arguments: interrupts/non tracing syscalls only save up to here: */
#define R11 48
#define R10 56
#define R9 64
@@ -22,7 +72,7 @@
#define ORIG_RAX 120 /* + error_code */
/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/* cpu exception frame or undefined in case of fast syscall: */
#define RIP 128
#define CS 136
#define EFLAGS 144
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bae482df603..b185091bf19 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -7,6 +7,20 @@
#include <linux/nodemask.h>
#include <linux/percpu.h>
+#ifdef CONFIG_SMP
+
+extern void prefill_possible_map(void);
+
+#else /* CONFIG_SMP */
+
+static inline void prefill_possible_map(void) {}
+
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define safe_smp_processor_id() 0
+#define stack_smp_processor_id() 0
+
+#endif /* CONFIG_SMP */
+
struct x86_cpu {
struct cpu cpu;
};
@@ -17,4 +31,7 @@ extern void arch_unregister_cpu(int);
#endif
DECLARE_PER_CPU(int, cpu_state);
+
+extern unsigned int boot_cpu_id;
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index cfdf8c2c5c3..7301e60dc4a 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -80,7 +80,6 @@
#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
@@ -92,6 +91,9 @@
#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
+#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -117,6 +119,7 @@
#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
@@ -237,6 +240,7 @@ extern const char * const x86_power_flags[32];
#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
+#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
new file mode 100644
index 00000000000..a7f3c75f8ad
--- /dev/null
+++ b/arch/x86/include/asm/cpumask.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_X86_CPUMASK_H
+#define _ASM_X86_CPUMASK_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_X86_64
+
+extern cpumask_var_t cpu_callin_mask;
+extern cpumask_var_t cpu_callout_mask;
+extern cpumask_var_t cpu_initialized_mask;
+extern cpumask_var_t cpu_sibling_setup_mask;
+
+extern void setup_cpu_local_masks(void);
+
+#else /* CONFIG_X86_32 */
+
+extern cpumask_t cpu_callin_map;
+extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_initialized;
+extern cpumask_t cpu_sibling_setup_map;
+
+#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
+#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
+#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
+#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
+
+static inline void setup_cpu_local_masks(void) { }
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d67..c68c361697e 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -1,39 +1,21 @@
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
-#ifdef CONFIG_X86_32
#include <linux/compiler.h>
#include <asm/percpu.h>
+#ifndef __ASSEMBLY__
struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
-static __always_inline struct task_struct *get_current(void)
-{
- return x86_read_percpu(current_task);
-}
-
-#else /* X86_32 */
-
-#ifndef __ASSEMBLY__
-#include <asm/pda.h>
-
-struct task_struct;
static __always_inline struct task_struct *get_current(void)
{
- return read_pda(pcurrent);
+ return percpu_read(current_task);
}
-#else /* __ASSEMBLY__ */
-
-#include <asm/asm-offsets.h>
-#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
+#define current get_current()
#endif /* __ASSEMBLY__ */
-#endif /* X86_32 */
-
-#define current get_current()
-
#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e6b82b17b07..dc27705f544 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr)
_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}
-#define SYS_VECTOR_FREE 0
-#define SYS_VECTOR_ALLOCED 1
-
extern int first_system_vector;
-extern char system_vectors[];
+/* used_vectors is a bitmap of vectors for irqs that are not managed by percpu vector_irq */
+extern unsigned long used_vectors[];
static inline void alloc_system_vector(int vector)
{
- if (system_vectors[vector] == SYS_VECTOR_FREE) {
- system_vectors[vector] = SYS_VECTOR_ALLOCED;
+ if (!test_bit(vector, used_vectors)) {
+ set_bit(vector, used_vectors);
if (first_system_vector > vector)
first_system_vector = vector;
} else
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 7f225a4b2a2..132a134d12f 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -2,8 +2,8 @@
#define _ASM_X86_DMA_MAPPING_H
/*
- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
- * documentation.
+ * IOMMU interface. See Documentation/PCI/PCI-DMA-mapping.txt and
+ * Documentation/DMA-API.txt for documentation.
*/
#include <linux/scatterlist.h>
@@ -65,21 +65,17 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
return dma_ops;
else
return dev->archdata.dma_ops;
-#endif /* _ASM_X86_DMA_MAPPING_H */
+#endif
}
/* Make sure we keep the same behaviour */
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
-#ifdef CONFIG_X86_32
- return 0;
-#else
struct dma_mapping_ops *ops = get_dma_ops(dev);
if (ops->mapping_error)
return ops->mapping_error(dev, dma_addr);
return (dma_addr == bad_dma_address);
-#endif
}
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/do_timer.h
index 23ecda0b28a..23ecda0b28a 100644
--- a/arch/x86/include/asm/mach-default/do_timer.h
+++ b/arch/x86/include/asm/do_timer.h
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 72c5a190bf4..a8f672ba100 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -6,14 +6,13 @@
* precise-event based sampling (PEBS).
*
* It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - DS and BTS hardware configuration
+ * - buffer overflow handling (to be done)
* - buffer access
*
- * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
*
*
* Copyright (C) 2007-2008 Intel Corporation.
@@ -23,13 +22,54 @@
#ifndef _ASM_X86_DS_H
#define _ASM_X86_DS_H
-#ifdef CONFIG_X86_DS
#include <linux/types.h>
#include <linux/init.h>
+#include <linux/err.h>
+
+#ifdef CONFIG_X86_DS
struct task_struct;
+struct ds_context;
+struct ds_tracer;
+struct bts_tracer;
+struct pebs_tracer;
+
+typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
+typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
+
+
+/*
+ * A list of features plus corresponding macros to talk about them in
+ * the ds_request function's flags parameter.
+ *
+ * We use the enum to index an array of corresponding control bits;
+ * we use the macro to index a flags bit-vector.
+ */
+enum ds_feature {
+ dsf_bts = 0,
+ dsf_bts_kernel,
+#define BTS_KERNEL (1 << dsf_bts_kernel)
+ /* trace kernel-mode branches */
+
+ dsf_bts_user,
+#define BTS_USER (1 << dsf_bts_user)
+ /* trace user-mode branches */
+
+ dsf_bts_overflow,
+ dsf_bts_max,
+ dsf_pebs = dsf_bts_max,
+
+ dsf_pebs_max,
+ dsf_ctl_max = dsf_pebs_max,
+ dsf_bts_timestamps = dsf_ctl_max,
+#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
+ /* add timestamps into BTS trace */
+
+#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
+};
+
/*
* Request BTS or PEBS
@@ -37,163 +77,169 @@ struct task_struct;
* Due to alignement constraints, the actual buffer may be slightly
* smaller than the requested or provided buffer.
*
- * Returns 0 on success; -Eerrno otherwise
+ * Returns a pointer to a tracer structure on success, or
+ * ERR_PTR(errcode) on failure.
+ *
+ * The interrupt threshold is independent from the overflow callback
+ * to allow users to use their own overflow interrupt handling mechanism.
*
* task: the task to request recording for;
* NULL for per-cpu recording on the current cpu
* base: the base pointer for the (non-pageable) buffer;
- * NULL if buffer allocation requested
- * size: the size of the requested or provided buffer
+ * size: the size of the provided buffer in bytes
* ovfl: pointer to a function to be called on buffer overflow;
* NULL if cyclic buffer requested
+ * th: the interrupt threshold in records from the end of the buffer;
+ * -1 if no interrupt threshold is requested.
+ * flags: a bit-mask of the above flags
*/
-typedef void (*ds_ovfl_callback_t)(struct task_struct *);
-extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl);
-extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl);
+extern struct bts_tracer *ds_request_bts(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
/*
* Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
*
- * Frees buffers allocated on ds_request.
- *
- * Returns 0 on success; -Eerrno otherwise
- *
- * task: the task to release resources for;
- * NULL to release resources for the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
*/
-extern int ds_release_bts(struct task_struct *task);
-extern int ds_release_pebs(struct task_struct *task);
+extern void ds_release_bts(struct bts_tracer *tracer);
+extern void ds_suspend_bts(struct bts_tracer *tracer);
+extern void ds_resume_bts(struct bts_tracer *tracer);
+extern void ds_release_pebs(struct pebs_tracer *tracer);
+extern void ds_suspend_pebs(struct pebs_tracer *tracer);
+extern void ds_resume_pebs(struct pebs_tracer *tracer);
-/*
- * Return the (array) index of the write pointer.
- * (assuming an array of BTS/PEBS records)
- *
- * Returns -Eerrno on error
- *
- * task: the task to access;
- * NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
- */
-extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
/*
- * Return the (array) index one record beyond the end of the array.
- * (assuming an array of BTS/PEBS records)
+ * The raw DS buffer state as it is used for BTS and PEBS recording.
*
- * Returns -Eerrno on error
- *
- * task: the task to access;
- * NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * This is the low-level, arch-dependent interface for working
+ * directly on the raw trace data.
*/
-extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
+struct ds_trace {
+ /* the number of bts/pebs records */
+ size_t n;
+ /* the size of a bts/pebs record in bytes */
+ size_t size;
+ /* pointers into the raw buffer:
+ - to the first entry */
+ void *begin;
+ /* - one beyond the last entry */
+ void *end;
+ /* - one beyond the newest entry */
+ void *top;
+ /* - the interrupt threshold */
+ void *ith;
+ /* flags given on ds_request() */
+ unsigned int flags;
+};
/*
- * Provide a pointer to the BTS/PEBS record at parameter index.
- * (assuming an array of BTS/PEBS records)
- *
- * The pointer points directly into the buffer. The user is
- * responsible for copying the record.
- *
- * Returns the size of a single record on success; -Eerrno on error
- *
- * task: the task to access;
- * NULL to access the current cpu
- * index: the index of the requested record
- * record (out): pointer to the requested record
+ * An arch-independent view on branch trace data.
*/
-extern int ds_access_bts(struct task_struct *task,
- size_t index, const void **record);
-extern int ds_access_pebs(struct task_struct *task,
- size_t index, const void **record);
+enum bts_qualifier {
+ bts_invalid,
+#define BTS_INVALID bts_invalid
+
+ bts_branch,
+#define BTS_BRANCH bts_branch
+
+ bts_task_arrives,
+#define BTS_TASK_ARRIVES bts_task_arrives
+
+ bts_task_departs,
+#define BTS_TASK_DEPARTS bts_task_departs
+
+ bts_qual_bit_size = 4,
+ bts_qual_max = (1 << bts_qual_bit_size),
+};
+
+struct bts_struct {
+ __u64 qualifier;
+ union {
+ /* BTS_BRANCH */
+ struct {
+ __u64 from;
+ __u64 to;
+ } lbr;
+ /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
+ struct {
+ __u64 jiffies;
+ pid_t pid;
+ } timestamp;
+ } variant;
+};
-/*
- * Write one or more BTS/PEBS records at the write pointer index and
- * advance the write pointer.
- *
- * If size is not a multiple of the record size, trailing bytes are
- * zeroed out.
- *
- * May result in one or more overflow notifications.
- *
- * If called during overflow handling, that is, with index >=
- * interrupt threshold, the write will wrap around.
- *
- * An overflow notification is given if and when the interrupt
- * threshold is reached during or after the write.
- *
- * Returns the number of bytes written or -Eerrno.
- *
- * task: the task to access;
- * NULL to access the current cpu
- * buffer: the buffer to write
- * size: the size of the buffer
- */
-extern int ds_write_bts(struct task_struct *task,
- const void *buffer, size_t size);
-extern int ds_write_pebs(struct task_struct *task,
- const void *buffer, size_t size);
/*
- * Same as ds_write_bts/pebs, but omit ownership checks.
+ * The BTS state.
*
- * This is needed to have some other task than the owner of the
- * BTS/PEBS buffer or the parameter task itself write into the
- * respective buffer.
+ * This gives access to the raw DS state and adds functions to provide
+ * an arch-independent view of the BTS data.
*/
-extern int ds_unchecked_write_bts(struct task_struct *task,
- const void *buffer, size_t size);
-extern int ds_unchecked_write_pebs(struct task_struct *task,
- const void *buffer, size_t size);
+struct bts_trace {
+ struct ds_trace ds;
+
+ int (*read)(struct bts_tracer *tracer, const void *at,
+ struct bts_struct *out);
+ int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
+};
+
/*
- * Reset the write pointer of the BTS/PEBS buffer.
+ * The PEBS state.
*
- * Returns 0 on success; -Eerrno on error
- *
- * task: the task to access;
- * NULL to access the current cpu
+ * This gives access to the raw DS state and the PEBS-specific counter
+ * reset value.
*/
-extern int ds_reset_bts(struct task_struct *task);
-extern int ds_reset_pebs(struct task_struct *task);
+struct pebs_trace {
+ struct ds_trace ds;
+
+ /* the PEBS reset value */
+ unsigned long long reset_value;
+};
+
/*
- * Clear the BTS/PEBS buffer and reset the write pointer.
- * The entire buffer will be zeroed out.
+ * Read the BTS or PEBS trace.
*
- * Returns 0 on success; -Eerrno on error
+ * Returns a view on the trace collected for the parameter tracer.
*
- * task: the task to access;
- * NULL to access the current cpu
+ * The view remains valid as long as the traced task is not running or
+ * the tracer is suspended.
+ * Writes into the trace buffer are not reflected.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
*/
-extern int ds_clear_bts(struct task_struct *task);
-extern int ds_clear_pebs(struct task_struct *task);
+extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
+extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
+
/*
- * Provide the PEBS counter reset value.
+ * Reset the write pointer of the BTS/PEBS buffer.
*
* Returns 0 on success; -Eerrno on error
*
- * task: the task to access;
- * NULL to access the current cpu
- * value (out): the counter reset value
+ * tracer: the tracer handle returned from ds_request_~()
*/
-extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
+extern int ds_reset_bts(struct bts_tracer *tracer);
+extern int ds_reset_pebs(struct pebs_tracer *tracer);
/*
* Set the PEBS counter reset value.
*
* Returns 0 on success; -Eerrno on error
*
- * task: the task to access;
- * NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
* value: the new counter reset value
*/
-extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
/*
* Initialization
@@ -201,38 +247,26 @@ extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
struct cpuinfo_x86;
extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
-
-
/*
- * The DS context - part of struct thread_struct.
+ * Context switch work
*/
-struct ds_context {
- /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
- unsigned char *ds;
- /* the owner of the BTS and PEBS configuration, respectively */
- struct task_struct *owner[2];
- /* buffer overflow notification function for BTS and PEBS */
- ds_ovfl_callback_t callback[2];
- /* the original buffer address */
- void *buffer[2];
- /* the number of allocated pages for on-request allocated buffers */
- unsigned int pages[2];
- /* use count */
- unsigned long count;
- /* a pointer to the context location inside the thread_struct
- * or the per_cpu context array */
- struct ds_context **this;
- /* a pointer to the task owning this context, or NULL, if the
- * context is owned by a cpu */
- struct task_struct *task;
-};
+extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-/* called by exit_thread() to free leftover contexts */
-extern void ds_free(struct ds_context *context);
+/*
+ * Task clone/init and cleanup work
+ */
+extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
+extern void ds_exit_thread(struct task_struct *tsk);
#else /* CONFIG_X86_DS */
-#define ds_init_intel(config) do {} while (0)
+struct cpuinfo_x86;
+static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
+static inline void ds_switch_to(struct task_struct *prev,
+ struct task_struct *next) {}
+static inline void ds_copy_thread(struct task_struct *tsk,
+ struct task_struct *father) {}
+static inline void ds_exit_thread(struct task_struct *tsk) {}
#endif /* CONFIG_X86_DS */
#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 804b6e6be92..3afc5e87cfd 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -6,56 +6,91 @@
#endif
/*
- Macros for dwarf2 CFI unwind table entries.
- See "as.info" for details on these pseudo ops. Unfortunately
- they are only supported in very new binutils, so define them
- away for older version.
+ * Macros for dwarf2 CFI unwind table entries.
+ * See "as.info" for details on these pseudo ops. Unfortunately
+ * they are only supported in very new binutils, so define them
+ * away for older version.
*/
#ifdef CONFIG_AS_CFI
-#define CFI_STARTPROC .cfi_startproc
-#define CFI_ENDPROC .cfi_endproc
-#define CFI_DEF_CFA .cfi_def_cfa
-#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
-#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
-#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
-#define CFI_OFFSET .cfi_offset
-#define CFI_REL_OFFSET .cfi_rel_offset
-#define CFI_REGISTER .cfi_register
-#define CFI_RESTORE .cfi_restore
-#define CFI_REMEMBER_STATE .cfi_remember_state
-#define CFI_RESTORE_STATE .cfi_restore_state
-#define CFI_UNDEFINED .cfi_undefined
+#define CFI_STARTPROC .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#define CFI_DEF_CFA .cfi_def_cfa
+#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
+#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
+#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
+#define CFI_OFFSET .cfi_offset
+#define CFI_REL_OFFSET .cfi_rel_offset
+#define CFI_REGISTER .cfi_register
+#define CFI_RESTORE .cfi_restore
+#define CFI_REMEMBER_STATE .cfi_remember_state
+#define CFI_RESTORE_STATE .cfi_restore_state
+#define CFI_UNDEFINED .cfi_undefined
#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
-#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
#else
#define CFI_SIGNAL_FRAME
#endif
#else
-/* Due to the structure of pre-exisiting code, don't use assembler line
- comment character # to ignore the arguments. Instead, use a dummy macro. */
+/*
+ * Due to the structure of pre-existing code, don't use assembler line
+ * comment character # to ignore the arguments. Instead, use a dummy macro.
+ */
.macro cfi_ignore a=0, b=0, c=0, d=0
.endm
-#define CFI_STARTPROC cfi_ignore
-#define CFI_ENDPROC cfi_ignore
-#define CFI_DEF_CFA cfi_ignore
+#define CFI_STARTPROC cfi_ignore
+#define CFI_ENDPROC cfi_ignore
+#define CFI_DEF_CFA cfi_ignore
#define CFI_DEF_CFA_REGISTER cfi_ignore
#define CFI_DEF_CFA_OFFSET cfi_ignore
#define CFI_ADJUST_CFA_OFFSET cfi_ignore
-#define CFI_OFFSET cfi_ignore
-#define CFI_REL_OFFSET cfi_ignore
-#define CFI_REGISTER cfi_ignore
-#define CFI_RESTORE cfi_ignore
-#define CFI_REMEMBER_STATE cfi_ignore
-#define CFI_RESTORE_STATE cfi_ignore
-#define CFI_UNDEFINED cfi_ignore
-#define CFI_SIGNAL_FRAME cfi_ignore
+#define CFI_OFFSET cfi_ignore
+#define CFI_REL_OFFSET cfi_ignore
+#define CFI_REGISTER cfi_ignore
+#define CFI_RESTORE cfi_ignore
+#define CFI_REMEMBER_STATE cfi_ignore
+#define CFI_RESTORE_STATE cfi_ignore
+#define CFI_UNDEFINED cfi_ignore
+#define CFI_SIGNAL_FRAME cfi_ignore
#endif
+/*
+ * An attempt to make CFI annotations more or less
+ * correct and shorter. It is implied that you know
+ * what you're doing if you use them.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_X86_64
+ .macro pushq_cfi reg
+ pushq \reg
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro popq_cfi reg
+ popq \reg
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
+ .macro movq_cfi reg offset=0
+ movq %\reg, \offset(%rsp)
+ CFI_REL_OFFSET \reg, \offset
+ .endm
+
+ .macro movq_cfi_restore offset reg
+ movq \offset(%rsp), %\reg
+ CFI_RESTORE \reg
+ .endm
+#else /*!CONFIG_X86_64*/
+
+ /* 32-bit definitions are still missing */
+
+#endif /*!CONFIG_X86_64*/
+#endif /*__ASSEMBLY__*/
+
#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3d8ceddbd40..00d41ce4c84 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -49,6 +49,7 @@
#define E820_RESERVED_KERN 128
#ifndef __ASSEMBLY__
+#include <linux/types.h>
struct e820entry {
__u64 addr; /* start of memory segment */
__u64 size; /* size of memory segment */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index a2e545c91c3..ca5ffb2856b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
#endif /* CONFIG_X86_32 */
+extern int add_efi_memmap;
extern void efi_reserve_early(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 40ca1bea791..83c1bc8d2e8 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -112,7 +112,7 @@ extern unsigned int vdso_enabled;
* now struct_user_regs, they are different)
*/
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
do { \
pr_reg[0] = regs->bx; \
pr_reg[1] = regs->cx; \
@@ -124,7 +124,6 @@ do { \
pr_reg[7] = regs->ds & 0xffff; \
pr_reg[8] = regs->es & 0xffff; \
pr_reg[9] = regs->fs & 0xffff; \
- savesegment(gs, pr_reg[10]); \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
pr_reg[13] = regs->cs & 0xffff; \
@@ -133,6 +132,18 @@ do { \
pr_reg[16] = regs->ss & 0xffff; \
} while (0);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ pr_reg[10] = get_user_gs(regs); \
+} while (0);
+
+#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ savesegment(gs, pr_reg[10]); \
+} while (0);
+
#define ELF_PLATFORM (utsname()->machine)
#define set_personality_64bit() do { } while (0)
@@ -325,7 +336,7 @@ struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
- int executable_stack);
+ int uses_interp);
extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
#define compat_arch_setup_additional_pages syscall32_setup_pages
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index 94826cf8745..cc70c1c78ca 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -8,7 +8,9 @@ enum reboot_type {
BOOT_BIOS = 'b',
#endif
BOOT_ACPI = 'a',
- BOOT_EFI = 'e'
+ BOOT_EFI = 'e',
+ BOOT_CF9 = 'p',
+ BOOT_CF9_COND = 'q',
};
extern enum reboot_type reboot_type;
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 6b1add8e31d..854d538ae85 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -9,12 +9,28 @@
* is no hardware IRQ pin equivalent for them, they are triggered
* through the ICC by us (IPIs)
*/
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
-BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+
+BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
+ smp_invalidate_interrupt)
+BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
+ smp_invalidate_interrupt)
#endif
/*
@@ -25,10 +41,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
* a much simpler SMP time architecture:
*/
#ifdef CONFIG_X86_LOCAL_APIC
+
BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+#ifdef CONFIG_PERF_COUNTERS
+BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+#endif
+
#ifdef CONFIG_X86_MCE_P4THERMAL
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
deleted file mode 100644
index 380f0b4f17e..00000000000
--- a/arch/x86/include/asm/es7000/apic.h
+++ /dev/null
@@ -1,193 +0,0 @@
-#ifndef __ASM_ES7000_APIC_H
-#define __ASM_ES7000_APIC_H
-
-#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
-#define esr_disable (1)
-
-static inline int apic_id_registered(void)
-{
- return (1);
-}
-
-static inline cpumask_t target_cpus(void)
-{
-#if defined CONFIG_ES7000_CLUSTERED_APIC
- return CPU_MASK_ALL;
-#else
- return cpumask_of_cpu(smp_processor_id());
-#endif
-}
-
-#if defined CONFIG_ES7000_CLUSTERED_APIC
-#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-#define INT_DELIVERY_MODE (dest_LowestPrio)
-#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */
-#define NO_BALANCE_IRQ (1)
-#undef WAKE_SECONDARY_VIA_INIT
-#define WAKE_SECONDARY_VIA_MIP
-#else
-#define APIC_DFR_VALUE (APIC_DFR_FLAT)
-#define INT_DELIVERY_MODE (dest_Fixed)
-#define INT_DEST_MODE (0) /* phys delivery to target procs */
-#define NO_BALANCE_IRQ (0)
-#undef APIC_DEST_LOGICAL
-#define APIC_DEST_LOGICAL 0x0
-#define WAKE_SECONDARY_VIA_INIT
-#endif
-
-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return 0;
-}
-static inline unsigned long check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-#define apicid_cluster(apicid) (apicid & 0xF0)
-
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long id;
- id = xapic_phys_to_log_apicid(cpu);
- return (SET_APIC_LOGICAL_ID(id));
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LdR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static inline void init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-#ifndef CONFIG_X86_GENERICARCH
-extern void enable_apic_mode(void);
-#endif
-
-extern int apic_version [MAX_APICS];
-static inline void setup_apic_routing(void)
-{
- int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
- printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
- (apic_version[apic] == 0x14) ?
- "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
-}
-
-static inline int multi_timer_check(int apic, int irq)
-{
- return 0;
-}
-
-static inline int apicid_to_node(int logical_apicid)
-{
- return 0;
-}
-
-
-static inline int cpu_present_to_apicid(int mps_cpu)
-{
- if (!mps_cpu)
- return boot_cpu_physical_apicid;
- else if (mps_cpu < NR_CPUS)
- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
-{
- static int id = 0;
- physid_mask_t mask;
- mask = physid_mask_of_physid(id);
- ++id;
- return mask;
-}
-
-extern u8 cpu_2_logical_apicid[];
-/* Mapping from cpu number to logical apicid */
-static inline int cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
- if (cpu >= NR_CPUS)
- return BAD_APICID;
- return (int)cpu_2_logical_apicid[cpu];
-#else
- return logical_smp_processor_id();
-#endif
-}
-
-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xff);
-}
-
-
-static inline void setup_portio_remap(void)
-{
-}
-
-extern unsigned int boot_cpu_physical_apicid;
-static inline int check_phys_apicid_present(int cpu_physical_apicid)
-{
- boot_cpu_physical_apicid = read_apic_id();
- return (1);
-}
-
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
- int num_bits_set;
- int cpus_found = 0;
- int cpu;
- int apicid;
-
- num_bits_set = cpus_weight(cpumask);
- /* Return id to all */
- if (num_bits_set == NR_CPUS)
-#if defined CONFIG_ES7000_CLUSTERED_APIC
- return 0xFF;
-#else
- return cpu_to_logical_apicid(0);
-#endif
- /*
- * The cpus in the mask must all be on the apic cluster. If are not
- * on the same apicid cluster return default value of TARGET_CPUS.
- */
- cpu = first_cpu(cpumask);
- apicid = cpu_to_logical_apicid(cpu);
- while (cpus_found < num_bits_set) {
- if (cpu_isset(cpu, cpumask)) {
- int new_apicid = cpu_to_logical_apicid(cpu);
- if (apicid_cluster(apicid) !=
- apicid_cluster(new_apicid)){
- printk ("%s: Not a valid mask!\n", __func__);
-#if defined CONFIG_ES7000_CLUSTERED_APIC
- return 0xFF;
-#else
- return cpu_to_logical_apicid(0);
-#endif
- }
- apicid = new_apicid;
- cpus_found++;
- }
- cpu++;
- }
- return apicid;
-}
-
-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-#endif /* __ASM_ES7000_APIC_H */
diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h
deleted file mode 100644
index 8b234a3cb85..00000000000
--- a/arch/x86/include/asm/es7000/apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __ASM_ES7000_APICDEF_H
-#define __ASM_ES7000_APICDEF_H
-
-#define APIC_ID_MASK (0xFF<<24)
-
-static inline unsigned get_apic_id(unsigned long x)
-{
- return (((x)>>24)&0xFF);
-}
-
-#define GET_APIC_ID(x) get_apic_id(x)
-
-#endif
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
deleted file mode 100644
index 632a955fcc0..00000000000
--- a/arch/x86/include/asm/es7000/ipi.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __ASM_ES7000_IPI_H
-#define __ASM_ES7000_IPI_H
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
-
-static inline void send_IPI_mask(cpumask_t mask, int vector)
-{
- send_IPI_mask_sequence(mask, vector);
-}
-
-static inline void send_IPI_allbutself(int vector)
-{
- cpumask_t mask = cpu_online_map;
- cpu_clear(smp_processor_id(), mask);
- if (!cpus_empty(mask))
- send_IPI_mask(mask, vector);
-}
-
-static inline void send_IPI_all(int vector)
-{
- send_IPI_mask(cpu_online_map, vector);
-}
-
-#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h
deleted file mode 100644
index ed5a3caae14..00000000000
--- a/arch/x86/include/asm/es7000/mpparse.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef __ASM_ES7000_MPPARSE_H
-#define __ASM_ES7000_MPPARSE_H
-
-#include <linux/acpi.h>
-
-extern int parse_unisys_oem (char *oemptr);
-extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
-extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
-extern void setup_unisys(void);
-
-#ifndef CONFIG_X86_GENERICARCH
-extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
-extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid);
-#endif
-
-#ifdef CONFIG_ACPI
-
-static inline int es7000_check_dsdt(void)
-{
- struct acpi_table_header header;
-
- if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
- !strncmp(header.oem_id, "UNISYS", 6))
- return 1;
- return 0;
-}
-#endif
-
-#endif /* __ASM_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
deleted file mode 100644
index 39849346191..00000000000
--- a/arch/x86/include/asm/es7000/wakecpu.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef __ASM_ES7000_WAKECPU_H
-#define __ASM_ES7000_WAKECPU_H
-
-/*
- * This file copes with machines that wakeup secondary CPUs by the
- * INIT, INIT, STARTUP sequence.
- */
-
-#ifdef CONFIG_ES7000_CLUSTERED_APIC
-#define WAKE_SECONDARY_VIA_MIP
-#else
-#define WAKE_SECONDARY_VIA_INIT
-#endif
-
-#ifdef WAKE_SECONDARY_VIA_MIP
-extern int es7000_start_cpu(int cpu, unsigned long eip);
-static inline int
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
-{
- int boot_error = 0;
- boot_error = es7000_start_cpu(phys_apicid, start_eip);
- return boot_error;
-}
-#endif
-
-#define TRAMPOLINE_LOW phys_to_virt(0x467)
-#define TRAMPOLINE_HIGH phys_to_virt(0x469)
-
-#define boot_cpu_apicid boot_cpu_physical_apicid
-
-static inline void wait_for_init_deassert(atomic_t *deassert)
-{
-#ifdef WAKE_SECONDARY_VIA_INIT
- while (!atomic_read(deassert))
- cpu_relax();
-#endif
- return;
-}
-
-/* Nothing to do for most platforms, since cleared by the INIT cycle */
-static inline void smp_callin_clear_local_apic(void)
-{
-}
-
-static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
-{
-}
-
-static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
-{
-}
-
-#define inquire_remote_apic(apicid) do { \
- if (apic_verbosity >= APIC_DEBUG) \
- __inquire_remote_apic(apicid); \
- } while (0)
-
-#endif /* __ASM_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
index c7115c1d721..047d9bab2b3 100644
--- a/arch/x86/include/asm/fixmap_32.h
+++ b/arch/x86/include/asm/fixmap_32.h
@@ -95,10 +95,6 @@ enum fixed_addresses {
(__end_of_permanent_fixed_addresses & 255),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
FIX_WP_TEST,
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
index 00a30ab9b1a..298d9ba3fae 100644
--- a/arch/x86/include/asm/fixmap_64.h
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -50,10 +50,6 @@ enum fixed_addresses {
FIX_PARAVIRT_BOOTMAP,
#endif
__end_of_permanent_fixed_addresses,
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9e8bc29b8b1..b55b4a7fbef 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -1,6 +1,33 @@
#ifndef _ASM_X86_FTRACE_H
#define _ASM_X86_FTRACE_H
+#ifdef __ASSEMBLY__
+
+ .macro MCOUNT_SAVE_FRAME
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+ .endm
+
+ .macro MCOUNT_RESTORE_FRAME
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+ .endm
+
+#endif
+
#ifdef CONFIG_FUNCTION_TRACER
#define MCOUNT_ADDR ((long)(mcount))
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -17,8 +44,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
*/
return addr - 1;
}
-#endif
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+ /* No extra data needed for x86 */
+};
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+ unsigned long ret;
+ unsigned long func;
+ unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relies on ftrace_return_to_handler.
+ * Defined in entry_32/64.S
+ */
+extern void return_to_handler(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 74252264433..6cfdafa409d 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -29,6 +29,39 @@ extern int fix_aperture;
#define AMD64_GARTCACHECTL 0x9c
#define AMD64_GARTEN (1<<0)
+#ifdef CONFIG_GART_IOMMU
+extern int gart_iommu_aperture;
+extern int gart_iommu_aperture_allowed;
+extern int gart_iommu_aperture_disabled;
+
+extern void early_gart_iommu_check(void);
+extern void gart_iommu_init(void);
+extern void gart_iommu_shutdown(void);
+extern void __init gart_parse_options(char *);
+extern void gart_iommu_hole_init(void);
+
+#else
+#define gart_iommu_aperture 0
+#define gart_iommu_aperture_allowed 0
+#define gart_iommu_aperture_disabled 1
+
+static inline void early_gart_iommu_check(void)
+{
+}
+static inline void gart_iommu_init(void)
+{
+}
+static inline void gart_iommu_shutdown(void)
+{
+}
+static inline void gart_parse_options(char *options)
+{
+}
+static inline void gart_iommu_hole_init(void)
+{
+}
+#endif
+
extern int agp_amd64_init(void);
static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
index d48bee663a6..273b99452ae 100644
--- a/arch/x86/include/asm/genapic.h
+++ b/arch/x86/include/asm/genapic.h
@@ -1,5 +1,263 @@
+#ifndef _ASM_X86_GENAPIC_H
+#define _ASM_X86_GENAPIC_H
+
+#include <linux/cpumask.h>
+
+#include <asm/mpspec.h>
+#include <asm/atomic.h>
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch data struct.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+struct genapic {
+ char *name;
+
+ int (*probe)(void);
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+ int (*apic_id_registered)(void);
+
+ u32 irq_delivery_mode;
+ u32 irq_dest_mode;
+
+ const struct cpumask *(*target_cpus)(void);
+
+ int disable_esr;
+
+ int dest_logical;
+ unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
+ unsigned long (*check_apicid_present)(int apicid);
+
+ void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+ void (*init_apic_ldr)(void);
+
+ physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
+
+ void (*setup_apic_routing)(void);
+ int (*multi_timer_check)(int apic, int irq);
+ int (*apicid_to_node)(int logical_apicid);
+ int (*cpu_to_logical_apicid)(int cpu);
+ int (*cpu_present_to_apicid)(int mps_cpu);
+ physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
+ void (*setup_portio_remap)(void);
+ int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
+ void (*enable_apic_mode)(void);
+ int (*phys_pkg_id)(int cpuid_apic, int index_msb);
+
+ /*
+ * When the next hook (or acpi_madt_oem_check above) returns 1
+ * the genapic is switched to this. Essentially they are
+ * additional probe functions:
+ */
+ int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
+
+ unsigned int (*get_apic_id)(unsigned long x);
+ unsigned long (*set_apic_id)(unsigned int id);
+ unsigned long apic_id_mask;
+
+ unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
+ unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+ const struct cpumask *andmask);
+
+ /* ipi */
+ void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+ void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+ int vector);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+
+ /* wakeup_secondary_cpu */
+ int (*wakeup_cpu)(int apicid, unsigned long start_eip);
+
+ int trampoline_phys_low;
+ int trampoline_phys_high;
+
+ void (*wait_for_init_deassert)(atomic_t *deassert);
+ void (*smp_callin_clear_local_apic)(void);
+ void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
+ void (*inquire_remote_apic)(int apicid);
+};
+
+extern struct genapic *apic;
+
+/*
+ * Warm reset vector default position:
+ */
+#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467
+#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
+
#ifdef CONFIG_X86_32
-# include "genapic_32.h"
+extern void es7000_update_genapic_to_cluster(void);
#else
-# include "genapic_64.h"
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2apic_cluster;
+extern struct genapic apic_x2apic_phys;
+extern int default_acpi_madt_oem_check(char *, char *);
+
+extern void apic_send_IPI_self(int vector);
+
+extern struct genapic apic_x2apic_uv_x;
+DECLARE_PER_CPU(int, x2apic_extra_bits);
+
+extern void default_setup_apic_routing(void);
+
+extern int default_cpu_present_to_apicid(int mps_cpu);
+extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
#endif
+
+static inline void default_wait_for_init_deassert(atomic_t *deassert)
+{
+ while (!atomic_read(deassert))
+ cpu_relax();
+ return;
+}
+
+extern void generic_bigsmp_probe(void);
+
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#include <asm/smp.h>
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+
+static inline const struct cpumask *default_target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return cpu_online_mask;
+#else
+ return cpumask_of(0);
+#endif
+}
+
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+
+
+static inline unsigned int read_apic_id(void)
+{
+ unsigned int reg;
+
+ reg = apic_read(APIC_ID);
+
+ return apic->get_apic_id(reg);
+}
+
+#ifdef CONFIG_X86_64
+extern void default_setup_apic_routing(void);
+#else
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+extern void default_init_apic_ldr(void);
+
+static inline int default_apic_id_registered(void)
+{
+ return physid_isset(read_apic_id(), phys_cpu_present_map);
+}
+
+static inline unsigned int
+default_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ return cpumask_bits(cpumask)[0];
+}
+
+static inline unsigned int
+default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ unsigned long mask1 = cpumask_bits(cpumask)[0];
+ unsigned long mask2 = cpumask_bits(andmask)[0];
+ unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
+
+ return (unsigned int)(mask1 & mask2 & mask3);
+}
+
+static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static inline void default_setup_apic_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "Flat", nr_ioapics);
+#endif
+}
+
+extern int default_apicid_to_node(int logical_apicid);
+
+#endif
+
+static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+
+static inline unsigned long default_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ return phys_map;
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int default_cpu_to_logical_apicid(int cpu)
+{
+ return 1 << cpu;
+}
+
+static inline int __default_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline int
+__default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
+}
+
+#ifdef CONFIG_X86_32
+static inline int default_cpu_present_to_apicid(int mps_cpu)
+{
+ return __default_cpu_present_to_apicid(mps_cpu);
+}
+
+static inline int
+default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+}
+#else
+extern int default_cpu_present_to_apicid(int mps_cpu);
+extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
+#endif
+
+static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#endif /* _ASM_X86_GENAPIC_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
deleted file mode 100644
index 5cbd4fcc06f..00000000000
--- a/arch/x86/include/asm/genapic_32.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef _ASM_X86_GENAPIC_32_H
-#define _ASM_X86_GENAPIC_32_H
-
-#include <asm/mpspec.h>
-
-/*
- * Generic APIC driver interface.
- *
- * An straight forward mapping of the APIC related parts of the
- * x86 subarchitecture interface to a dynamic object.
- *
- * This is used by the "generic" x86 subarchitecture.
- *
- * Copyright 2003 Andi Kleen, SuSE Labs.
- */
-
-struct mpc_config_bus;
-struct mp_config_table;
-struct mpc_config_processor;
-
-struct genapic {
- char *name;
- int (*probe)(void);
-
- int (*apic_id_registered)(void);
- cpumask_t (*target_cpus)(void);
- int int_delivery_mode;
- int int_dest_mode;
- int ESR_DISABLE;
- int apic_destination_logical;
- unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
- unsigned long (*check_apicid_present)(int apicid);
- int no_balance_irq;
- int no_ioapic_check;
- void (*init_apic_ldr)(void);
- physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
-
- void (*setup_apic_routing)(void);
- int (*multi_timer_check)(int apic, int irq);
- int (*apicid_to_node)(int logical_apicid);
- int (*cpu_to_logical_apicid)(int cpu);
- int (*cpu_present_to_apicid)(int mps_cpu);
- physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
- void (*setup_portio_remap)(void);
- int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
- void (*enable_apic_mode)(void);
- u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
-
- /* mpparse */
- /* When one of the next two hooks returns 1 the genapic
- is switched to this. Essentially they are additional probe
- functions. */
- int (*mps_oem_check)(struct mp_config_table *mpc, char *oem,
- char *productid);
- int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-
- unsigned (*get_apic_id)(unsigned long x);
- unsigned long apic_id_mask;
- unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
- cpumask_t (*vector_allocation_domain)(int cpu);
-
-#ifdef CONFIG_SMP
- /* ipi */
- void (*send_IPI_mask)(cpumask_t mask, int vector);
- void (*send_IPI_allbutself)(int vector);
- void (*send_IPI_all)(int vector);
-#endif
-};
-
-#define APICFUNC(x) .x = x,
-
-/* More functions could be probably marked IPIFUNC and save some space
- in UP GENERICARCH kernels, but I don't have the nerve right now
- to untangle this mess. -AK */
-#ifdef CONFIG_SMP
-#define IPIFUNC(x) APICFUNC(x)
-#else
-#define IPIFUNC(x)
-#endif
-
-#define APIC_INIT(aname, aprobe) \
-{ \
- .name = aname, \
- .probe = aprobe, \
- .int_delivery_mode = INT_DELIVERY_MODE, \
- .int_dest_mode = INT_DEST_MODE, \
- .no_balance_irq = NO_BALANCE_IRQ, \
- .ESR_DISABLE = esr_disable, \
- .apic_destination_logical = APIC_DEST_LOGICAL, \
- APICFUNC(apic_id_registered) \
- APICFUNC(target_cpus) \
- APICFUNC(check_apicid_used) \
- APICFUNC(check_apicid_present) \
- APICFUNC(init_apic_ldr) \
- APICFUNC(ioapic_phys_id_map) \
- APICFUNC(setup_apic_routing) \
- APICFUNC(multi_timer_check) \
- APICFUNC(apicid_to_node) \
- APICFUNC(cpu_to_logical_apicid) \
- APICFUNC(cpu_present_to_apicid) \
- APICFUNC(apicid_to_cpu_present) \
- APICFUNC(setup_portio_remap) \
- APICFUNC(check_phys_apicid_present) \
- APICFUNC(mps_oem_check) \
- APICFUNC(get_apic_id) \
- .apic_id_mask = APIC_ID_MASK, \
- APICFUNC(cpu_mask_to_apicid) \
- APICFUNC(vector_allocation_domain) \
- APICFUNC(acpi_madt_oem_check) \
- IPIFUNC(send_IPI_mask) \
- IPIFUNC(send_IPI_allbutself) \
- IPIFUNC(send_IPI_all) \
- APICFUNC(enable_apic_mode) \
- APICFUNC(phys_pkg_id) \
-}
-
-extern struct genapic *genapic;
-
-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
-#define get_uv_system_type() UV_NONE
-#define is_uv_system() 0
-#define uv_wakeup_secondary(a, b) 1
-#define uv_system_init() do {} while (0)
-
-
-#endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
deleted file mode 100644
index 13c4e96199e..00000000000
--- a/arch/x86/include/asm/genapic_64.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef _ASM_X86_GENAPIC_64_H
-#define _ASM_X86_GENAPIC_64_H
-
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Generic APIC sub-arch data struct.
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-
-struct genapic {
- char *name;
- int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
- u32 int_delivery_mode;
- u32 int_dest_mode;
- int (*apic_id_registered)(void);
- cpumask_t (*target_cpus)(void);
- cpumask_t (*vector_allocation_domain)(int cpu);
- void (*init_apic_ldr)(void);
- /* ipi */
- void (*send_IPI_mask)(cpumask_t mask, int vector);
- void (*send_IPI_allbutself)(int vector);
- void (*send_IPI_all)(int vector);
- void (*send_IPI_self)(int vector);
- /* */
- unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
- unsigned int (*phys_pkg_id)(int index_msb);
- unsigned int (*get_apic_id)(unsigned long x);
- unsigned long (*set_apic_id)(unsigned int id);
- unsigned long apic_id_mask;
-};
-
-extern struct genapic *genapic;
-
-extern struct genapic apic_flat;
-extern struct genapic apic_physflat;
-extern struct genapic apic_x2apic_cluster;
-extern struct genapic apic_x2apic_phys;
-extern int acpi_madt_oem_check(char *, char *);
-
-extern void apic_send_IPI_self(int vector);
-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
-extern enum uv_system_type get_uv_system_type(void);
-extern int is_uv_system(void);
-
-extern struct genapic apic_x2apic_uv_x;
-DECLARE_PER_CPU(int, x2apic_extra_bits);
-extern void uv_cpu_init(void);
-extern void uv_system_init(void);
-extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
-
-extern void setup_apic_routing(void);
-
-#endif /* _ASM_X86_GENAPIC_64_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 000787df66e..176f058e715 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -1,11 +1,52 @@
-#ifdef CONFIG_X86_32
-# include "hardirq_32.h"
-#else
-# include "hardirq_64.h"
+#ifndef _ASM_X86_HARDIRQ_H
+#define _ASM_X86_HARDIRQ_H
+
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+typedef struct {
+ unsigned int __softirq_pending;
+ unsigned int __nmi_count; /* arch dependent */
+ unsigned int irq0_irqs;
+#ifdef CONFIG_X86_LOCAL_APIC
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq_spurious_count;
+#endif
+#ifdef CONFIG_SMP
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+ unsigned int irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ unsigned int irq_thermal_count;
+# ifdef CONFIG_X86_64
+ unsigned int irq_threshold_count;
+# endif
#endif
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
+/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
+#define MAX_HARDIRQS_PER_CPU NR_VECTORS
+
+#define __ARCH_IRQ_STAT
+
+#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
+
+#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
+
+#define __ARCH_SET_SOFTIRQ_PENDING
+
+#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
+
+extern void ack_bad_irq(unsigned int irq);
extern u64 arch_irq_stat_cpu(unsigned int cpu);
#define arch_irq_stat_cpu arch_irq_stat_cpu
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
+
+#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
deleted file mode 100644
index 5ca135e72f2..00000000000
--- a/arch/x86/include/asm/hardirq_32.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _ASM_X86_HARDIRQ_32_H
-#define _ASM_X86_HARDIRQ_32_H
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-typedef struct {
- unsigned int __softirq_pending;
- unsigned long idle_timestamp;
- unsigned int __nmi_count; /* arch dependent */
- unsigned int apic_timer_irqs; /* arch dependent */
- unsigned int irq0_irqs;
- unsigned int irq_resched_count;
- unsigned int irq_call_count;
- unsigned int irq_tlb_count;
- unsigned int irq_thermal_count;
- unsigned int irq_spurious_count;
-} ____cacheline_aligned irq_cpustat_t;
-
-DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
-
-#define __ARCH_IRQ_STAT
-#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
-
-void ack_bad_irq(unsigned int irq);
-#include <linux/irq_cpustat.h>
-
-#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
deleted file mode 100644
index 1ba381fc51d..00000000000
--- a/arch/x86/include/asm/hardirq_64.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _ASM_X86_HARDIRQ_64_H
-#define _ASM_X86_HARDIRQ_64_H
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-#include <asm/pda.h>
-#include <asm/apic.h>
-
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
-#define __ARCH_IRQ_STAT 1
-
-#define local_softirq_pending() read_pda(__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING 1
-
-#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
-#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
-
-extern void ack_bad_irq(unsigned int irq);
-
-#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b97aecb0b61..370e1c83bb4 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -25,8 +25,6 @@
#include <asm/irq.h>
#include <asm/sections.h>
-#define platform_legacy_irq(irq) ((irq) < 16)
-
/* Interrupt handlers registered during init_IRQ */
extern void apic_timer_interrupt(void);
extern void error_interrupt(void);
@@ -58,7 +56,7 @@ extern void make_8259A_irq(unsigned int irq);
extern void init_8259A(int aeoi);
/* IOAPIC */
-#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
+#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
extern unsigned long io_apic_irqs;
extern void init_VISWS_APIC_irqs(void);
@@ -67,15 +65,7 @@ extern void disable_IO_APIC(void);
extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
extern void setup_ioapic_dest(void);
-#ifdef CONFIG_X86_64
extern void enable_IO_APIC(void);
-#endif
-
-/* IPI functions */
-#ifdef CONFIG_X86_32
-extern void send_IPI_self(int vector);
-#endif
-extern void send_IPI(int dest, int vector);
/* Statistics */
extern atomic_t irq_err_count;
@@ -84,21 +74,11 @@ extern atomic_t irq_mis_count;
/* EISA */
extern void eisa_set_level_irq(unsigned int irq);
-/* Voyager functions */
-extern asmlinkage void vic_cpi_interrupt(void);
-extern asmlinkage void vic_sys_interrupt(void);
-extern asmlinkage void vic_cmn_interrupt(void);
-extern asmlinkage void qic_timer_interrupt(void);
-extern asmlinkage void qic_invalidate_interrupt(void);
-extern asmlinkage void qic_reschedule_interrupt(void);
-extern asmlinkage void qic_enable_irq_interrupt(void);
-extern asmlinkage void qic_call_function_interrupt(void);
-
/* SMP */
extern void smp_apic_timer_interrupt(struct pt_regs *);
extern void smp_spurious_interrupt(struct pt_regs *);
extern void smp_error_interrupt(struct pt_regs *);
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
extern void smp_reschedule_interrupt(struct pt_regs *);
extern void smp_call_function_interrupt(struct pt_regs *);
extern void smp_call_function_single_interrupt(struct pt_regs *);
@@ -109,9 +89,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
#endif
#endif
-#ifdef CONFIG_X86_32
-extern void (*const interrupt[NR_VECTORS])(void);
-#endif
+extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
new file mode 100644
index 00000000000..369f5c5d09a
--- /dev/null
+++ b/arch/x86/include/asm/hypervisor.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2008, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef ASM_X86__HYPERVISOR_H
+#define ASM_X86__HYPERVISOR_H
+
+extern unsigned long get_hypervisor_tsc_freq(void);
+extern void init_hypervisor(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 97989c0e534..50ca486fd88 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -129,24 +129,6 @@ typedef struct compat_siginfo {
} _sifields;
} compat_siginfo_t;
-struct sigframe32 {
- u32 pretcode;
- int sig;
- struct sigcontext_ia32 sc;
- struct _fpstate_ia32 fpstate;
- unsigned int extramask[_COMPAT_NSIG_WORDS-1];
-};
-
-struct rt_sigframe32 {
- u32 pretcode;
- int sig;
- u32 pinfo;
- u32 puc;
- compat_siginfo_t info;
- struct ucontext_ia32 uc;
- struct _fpstate_ia32 fpstate;
-};
-
struct ustat32 {
__u32 f_tfree;
compat_ino_t f_tinode;
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 44c89c3a23e..38d87379e27 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -8,8 +8,13 @@ struct notifier_block;
void idle_notifier_register(struct notifier_block *n);
void idle_notifier_unregister(struct notifier_block *n);
+#ifdef CONFIG_X86_64
void enter_idle(void);
void exit_idle(void);
+#else /* !CONFIG_X86_64 */
+static inline void enter_idle(void) { }
+static inline void exit_idle(void) { }
+#endif /* CONFIG_X86_64 */
void c1e_remove_cpu(int cpu);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index ac2abc88cd9..e5a2ab44cd5 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -4,6 +4,8 @@
#define ARCH_HAS_IOREMAP_WC
#include <linux/compiler.h>
+#include <asm-generic/int-ll64.h>
+#include <asm/page.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -45,21 +47,128 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#define mmiowb() barrier()
#ifdef CONFIG_X86_64
+
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
-build_mmio_read(__readq, "q", unsigned long, "=r", )
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-build_mmio_write(__writeq, "q", unsigned long, "r", )
-#define readq_relaxed(a) __readq(a)
-#define __raw_readq __readq
-#define __raw_writeq writeq
+#else
+
+static inline __u64 readq(const volatile void __iomem *addr)
+{
+ const volatile u32 __iomem *p = addr;
+ u32 low, high;
+
+ low = readl(p);
+ high = readl(p + 1);
+
+ return low + ((u64)high << 32);
+}
+
+static inline void writeq(__u64 val, volatile void __iomem *addr)
+{
+ writel(val, addr);
+ writel(val >> 32, addr+4);
+}
-/* Let people know we have them */
-#define readq readq
-#define writeq writeq
#endif
-extern int iommu_bio_merge;
+#define readq_relaxed(a) readq(a)
+
+#define __raw_readq(a) readq(a)
+#define __raw_writeq(val, addr) writeq(val, addr)
+
+/* Let people know that we have them */
+#define readq readq
+#define writeq writeq
+
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+#define isa_virt_to_bus virt_to_phys
+#define isa_page_to_bus page_to_phys
+#define isa_bus_to_virt phys_to_virt
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
#ifdef CONFIG_X86_32
# include "io_32.h"
@@ -72,7 +181,7 @@ extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
unsigned long prot_val);
-extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
/*
* early_ioremap() and early_iounmap() are for temporary early boot-time
@@ -80,12 +189,12 @@ extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
* A boot-time mapping is currently limited to at most 16 pages.
*/
extern void early_ioremap_init(void);
-extern void early_ioremap_clear(void);
extern void early_ioremap_reset(void);
extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+#define IO_SPACE_LIMIT 0xffff
#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
index d8e242e1b39..a299900f592 100644
--- a/arch/x86/include/asm/io_32.h
+++ b/arch/x86/include/asm/io_32.h
@@ -37,8 +37,6 @@
* - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*/
-#define IO_SPACE_LIMIT 0xffff
-
#define XQUAD_PORTIO_BASE 0xfe400000
#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
@@ -53,92 +51,6 @@
*/
#define xlate_dev_kmem_ptr(p) p
-/**
- * virt_to_phys - map virtual addresses to physical
- * @address: address to remap
- *
- * The returned physical address is the physical (CPU) mapping for
- * the memory address given. It is only valid to use this function on
- * addresses directly mapped or allocated via kmalloc.
- *
- * This function does not give bus mappings for DMA transfers. In
- * almost all conceivable cases a device driver should not be using
- * this function
- */
-
-static inline unsigned long virt_to_phys(volatile void *address)
-{
- return __pa(address);
-}
-
-/**
- * phys_to_virt - map physical address to virtual
- * @address: address to remap
- *
- * The returned virtual address is a current CPU mapping for
- * the memory address given. It is only valid to use this function on
- * addresses that have a kernel mapping
- *
- * This function does not handle bus mappings for DMA transfers. In
- * almost all conceivable cases a device driver should not be using
- * this function
- */
-
-static inline void *phys_to_virt(unsigned long address)
-{
- return __va(address);
-}
-
-/*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-
-/**
- * ioremap - map bus memory into CPU space
- * @offset: bus address of the memory
- * @size: size of the resource to map
- *
- * ioremap performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address.
- *
- * If the area you are trying to map is a PCI BAR you should have a
- * look at pci_iomap().
- */
-extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
-extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
-extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
- unsigned long prot_val);
-
-/*
- * The default ioremap() behavior is non-cached:
- */
-static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
-{
- return ioremap_nocache(offset, size);
-}
-
-extern void iounmap(volatile void __iomem *addr);
-
-/*
- * ISA I/O bus memory addresses are 1:1 with the physical address.
- */
-#define isa_virt_to_bus virt_to_phys
-#define isa_page_to_bus page_to_phys
-#define isa_bus_to_virt phys_to_virt
-
-/*
- * However PCI ones are not necessarily 1:1 and therefore these interfaces
- * are forbidden in portable PCI drivers.
- *
- * Allow them on x86 for legacy drivers, though.
- */
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
-
static inline void
memset_io(volatile void __iomem *addr, unsigned char val, int count)
{
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
index fea325a1122..244067893af 100644
--- a/arch/x86/include/asm/io_64.h
+++ b/arch/x86/include/asm/io_64.h
@@ -136,73 +136,12 @@ __OUTS(b)
__OUTS(w)
__OUTS(l)
-#define IO_SPACE_LIMIT 0xffff
-
#if defined(__KERNEL__) && defined(__x86_64__)
#include <linux/vmalloc.h>
-#ifndef __i386__
-/*
- * Change virtual addresses to physical addresses and vv.
- * These are pretty trivial
- */
-static inline unsigned long virt_to_phys(volatile void *address)
-{
- return __pa(address);
-}
-
-static inline void *phys_to_virt(unsigned long address)
-{
- return __va(address);
-}
-#endif
-
-/*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-
#include <asm-generic/iomap.h>
-/*
- * This one maps high address device memory and turns off caching for that area.
- * it's useful if some control registers are in such an area and write combining
- * or read caching is not desirable:
- */
-extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
-extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
-extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
- unsigned long prot_val);
-
-/*
- * The default ioremap() behavior is non-cached:
- */
-static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
-{
- return ioremap_nocache(offset, size);
-}
-
-extern void iounmap(volatile void __iomem *addr);
-
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
-
-/*
- * ISA I/O bus memory addresses are 1:1 with the physical address.
- */
-#define isa_virt_to_bus virt_to_phys
-#define isa_page_to_bus page_to_phys
-#define isa_bus_to_virt phys_to_virt
-
-/*
- * However PCI ones are not necessarily 1:1 and therefore these interfaces
- * are forbidden in portable PCI drivers.
- *
- * Allow them on x86 for legacy drivers, though.
- */
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
-
void __memcpy_fromio(void *, unsigned long, unsigned);
void __memcpy_toio(unsigned long, const void *, unsigned);
@@ -232,8 +171,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c);
#define flush_write_buffers()
-#define BIO_VMERGE_BOUNDARY iommu_bio_merge
-
/*
* Convert a virtual cached pointer to an uncached pointer
*/
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 6afd9933a7d..59cb4a1317b 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -114,38 +114,16 @@ struct IR_IO_APIC_route_entry {
extern int nr_ioapics;
extern int nr_ioapic_registers[MAX_IO_APICS];
-/*
- * MP-BIOS irq configuration table structures:
- */
-
#define MP_MAX_IOAPIC_PIN 127
-struct mp_config_ioapic {
- unsigned long mp_apicaddr;
- unsigned int mp_apicid;
- unsigned char mp_type;
- unsigned char mp_apicver;
- unsigned char mp_flags;
-};
-
-struct mp_config_intsrc {
- unsigned int mp_dstapic;
- unsigned char mp_type;
- unsigned char mp_irqtype;
- unsigned short mp_irqflag;
- unsigned char mp_srcbus;
- unsigned char mp_srcbusirq;
- unsigned char mp_dstirq;
-};
-
/* I/O APIC entries */
-extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
/* # of MP IRQ source entries */
extern int mp_irq_entries;
/* MP IRQ source entries */
-extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* non-0 if default (table-less) MP configuration */
extern int mpc_default_type;
@@ -156,14 +134,15 @@ extern int sis_apic_bug;
/* 1 if "noapic" boot option passed */
extern int skip_ioapic_setup;
+/* 1 if "noapic" boot option passed */
+extern int noioapicquirk;
+
+/* -1 if "noapic" boot option passed */
+extern int noioapicreroute;
+
/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
extern int timer_through_8259;
-static inline void disable_ioapic_setup(void)
-{
- skip_ioapic_setup = 1;
-}
-
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
@@ -188,17 +167,20 @@ extern void restore_IO_APIC_setup(void);
extern void reinit_intr_remapped_IO_APIC(int);
#endif
-extern int probe_nr_irqs(void);
+extern void probe_nr_irqs_gsi(void);
+extern int setup_ioapic_entry(int apic, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector);
+extern void ioapic_write_entry(int apic, int pin,
+ struct IO_APIC_route_entry e);
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_init_mappings(void) { }
-static inline int probe_nr_irqs(void)
-{
- return NR_IRQS;
-}
+static inline void probe_nr_irqs_gsi(void) { }
#endif
#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
new file mode 100644
index 00000000000..c1f06289b14
--- /dev/null
+++ b/arch/x86/include/asm/iomap.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+
+void
+iounmap_atomic(void *kvaddr, enum km_type type);
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index e4a552d4446..a6ee9e6f530 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,44 +6,8 @@ extern void no_iommu_init(void);
extern struct dma_mapping_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
-extern int dmar_disabled;
-
-extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
-#ifdef CONFIG_GART_IOMMU
-extern int gart_iommu_aperture;
-extern int gart_iommu_aperture_allowed;
-extern int gart_iommu_aperture_disabled;
-
-extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
-extern void __init gart_parse_options(char *);
-extern void gart_iommu_hole_init(void);
-
-#else
-#define gart_iommu_aperture 0
-#define gart_iommu_aperture_allowed 0
-#define gart_iommu_aperture_disabled 1
-
-static inline void early_gart_iommu_check(void)
-{
-}
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
-static inline void gart_parse_options(char *options)
-{
-}
-static inline void gart_iommu_hole_init(void)
-{
-}
-#endif
-
#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index f89dffb28aa..5f2efc5d992 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_IPI_H
#define _ASM_X86_IPI_H
+#ifdef CONFIG_X86_LOCAL_APIC
+
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
@@ -55,8 +57,8 @@ static inline void __xapic_wait_icr_idle(void)
cpu_relax();
}
-static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
- unsigned int dest)
+static inline void
+__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
{
/*
* Subtle. In the case of the 'never do double writes' workaround
@@ -87,8 +89,8 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
* This is used to send an IPI with no shorthand notation (the destination is
* specified in bits 56 to 63 of the ICR).
*/
-static inline void __send_IPI_dest_field(unsigned int mask, int vector,
- unsigned int dest)
+static inline void
+ __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
{
unsigned long cfg;
@@ -117,22 +119,46 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
native_apic_mem_write(APIC_ICR, cfg);
}
-static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector);
+#include <asm/genapic.h>
+
+extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector);
+
+/* Avoid include hell */
+#define NMI_VECTOR 0x02
+
+extern int no_broadcast;
+
+static inline void __default_local_send_IPI_allbutself(int vector)
{
- unsigned long flags;
- unsigned long query_cpu;
+ if (no_broadcast || vector == NMI_VECTOR)
+ apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
+ else
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical);
+}
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicast to each CPU instead.
- * - mbligh
- */
- local_irq_save(flags);
- for_each_cpu_mask_nr(query_cpu, mask) {
- __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+static inline void __default_local_send_IPI_all(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR)
+ apic->send_IPI_mask(cpu_online_mask, vector);
+ else
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical);
}
+#ifdef CONFIG_X86_32
+extern void default_send_IPI_mask_logical(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_allbutself(int vector);
+extern void default_send_IPI_all(int vector);
+extern void default_send_IPI_self(int vector);
+#endif
+
+#endif
+
#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index bae0eda9548..107eb219669 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -31,20 +31,19 @@ static inline int irq_canonicalize(int irq)
# endif
#endif
-#ifdef CONFIG_IRQBALANCE
-extern int irqbalance_disable(char *str);
-#endif
-
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
-extern void fixup_irqs(cpumask_t map);
+extern void fixup_irqs(void);
#endif
-extern unsigned int do_IRQ(struct pt_regs *regs);
extern void init_IRQ(void);
extern void native_init_IRQ(void);
+extern bool handle_irq(unsigned irq, struct pt_regs *regs);
+
+extern unsigned int do_IRQ(struct pt_regs *regs);
/* Interrupt vector management */
extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+extern int vector_used_by_percpu_irq(unsigned int vector);
#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
index 89c898ab298..77843225b7e 100644
--- a/arch/x86/include/asm/irq_regs.h
+++ b/arch/x86/include/asm/irq_regs.h
@@ -1,5 +1,31 @@
-#ifdef CONFIG_X86_32
-# include "irq_regs_32.h"
-#else
-# include "irq_regs_64.h"
-#endif
+/*
+ * Per-cpu current frame pointer - the location of the last exception frame on
+ * the stack, stored in the per-cpu area.
+ *
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+#ifndef _ASM_X86_IRQ_REGS_H
+#define _ASM_X86_IRQ_REGS_H
+
+#include <asm/percpu.h>
+
+#define ARCH_HAS_OWN_IRQ_REGS
+
+DECLARE_PER_CPU(struct pt_regs *, irq_regs);
+
+static inline struct pt_regs *get_irq_regs(void)
+{
+ return percpu_read(irq_regs);
+}
+
+static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
+{
+ struct pt_regs *old_regs;
+
+ old_regs = get_irq_regs();
+ percpu_write(irq_regs, new_regs);
+
+ return old_regs;
+}
+
+#endif /* _ASM_X86_IRQ_REGS_H */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
deleted file mode 100644
index af2f02d27fc..00000000000
--- a/arch/x86/include/asm/irq_regs_32.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Per-cpu current frame pointer - the location of the last exception frame on
- * the stack, stored in the per-cpu area.
- *
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-#ifndef _ASM_X86_IRQ_REGS_32_H
-#define _ASM_X86_IRQ_REGS_32_H
-
-#include <asm/percpu.h>
-
-DECLARE_PER_CPU(struct pt_regs *, irq_regs);
-
-static inline struct pt_regs *get_irq_regs(void)
-{
- return x86_read_percpu(irq_regs);
-}
-
-static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
-{
- struct pt_regs *old_regs;
-
- old_regs = get_irq_regs();
- x86_write_percpu(irq_regs, new_regs);
-
- return old_regs;
-}
-
-#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
deleted file mode 100644
index 3dd9c0b7027..00000000000
--- a/arch/x86/include/asm/irq_regs_64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f94..b07278c55e9 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,47 +1,69 @@
#ifndef _ASM_X86_IRQ_VECTORS_H
#define _ASM_X86_IRQ_VECTORS_H
-#include <linux/threads.h>
+/*
+ * Linux IRQ vector layout.
+ *
+ * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can
+ * be defined by Linux. They are used as a jump table by the CPU when a
+ * given vector is triggered - by a CPU-external, CPU-internal or
+ * software-triggered event.
+ *
+ * Linux sets the kernel code address each entry jumps to early during
+ * bootup, and never changes them. This is the general layout of the
+ * IDT entries:
+ *
+ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
+ * Vectors 32 ... 127 : device interrupts
+ * Vector 128 : legacy int80 syscall interface
+ * Vectors 129 ... 237 : device interrupts
+ * Vectors 238 ... 255 : special interrupts
+ *
+ * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
+ *
+ * This file enumerates their exact layout:
+ */
-#define NMI_VECTOR 0x02
+#define NMI_VECTOR 0x02
/*
* IDT vectors usable for external interrupt sources start
* at 0x20:
*/
-#define FIRST_EXTERNAL_VECTOR 0x20
+#define FIRST_EXTERNAL_VECTOR 0x20
#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
+# define SYSCALL_VECTOR 0x80
#else
-# define IA32_SYSCALL_VECTOR 0x80
+# define IA32_SYSCALL_VECTOR 0x80
#endif
/*
* Reserve the lowest usable priority level 0x20 - 0x2f for triggering
* cleanup after irq migration.
*/
-#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
*/
-#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
-#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
-#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
-#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
-#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
-#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
-#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
-#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
-#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
-#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
-#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
-#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
-#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
-#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
-#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
-#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
+#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+
+#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
+#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
+#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
+#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
+#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
+#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
+#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
+#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
+#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
+#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
+#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
+#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
+#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
+#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
+#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
/*
* Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -49,108 +71,98 @@
* some of the following vectors are 'rare', they are merged
* into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
* TLB, reschedule and local APIC vectors are performance-critical.
- *
- * Vectors 0xf0-0xfa are free (reserved for future Linux use).
*/
-#ifdef CONFIG_X86_32
-
-# define SPURIOUS_APIC_VECTOR 0xff
-# define ERROR_APIC_VECTOR 0xfe
-# define INVALIDATE_TLB_VECTOR 0xfd
-# define RESCHEDULE_VECTOR 0xfc
-# define CALL_FUNCTION_VECTOR 0xfb
-# define CALL_FUNCTION_SINGLE_VECTOR 0xfa
-# define THERMAL_APIC_VECTOR 0xf0
-
-#else
#define SPURIOUS_APIC_VECTOR 0xff
+/*
+ * Sanity check
+ */
+#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
+# error SPURIOUS_APIC_VECTOR definition error
+#endif
+
#define ERROR_APIC_VECTOR 0xfe
#define RESCHEDULE_VECTOR 0xfd
#define CALL_FUNCTION_VECTOR 0xfc
#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
#define THERMAL_APIC_VECTOR 0xfa
-#define THRESHOLD_APIC_VECTOR 0xf9
-#define UV_BAU_MESSAGE 0xf8
-#define INVALIDATE_TLB_VECTOR_END 0xf7
-#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
-
-#define NUM_INVALIDATE_TLB_VECTORS 8
+#ifdef CONFIG_X86_32
+/* 0xf8 - 0xf9 : free */
+#else
+# define THRESHOLD_APIC_VECTOR 0xf9
+# define UV_BAU_MESSAGE 0xf8
#endif
+/* f0-f7 used for spreading out TLB flushes: */
+#define INVALIDATE_TLB_VECTOR_END 0xf7
+#define INVALIDATE_TLB_VECTOR_START 0xf0
+#define NUM_INVALIDATE_TLB_VECTORS 8
+
/*
* Local APIC timer IRQ vector is on a different priority level,
* to work around the 'lost local interrupt if more than 2 IRQ
* sources per level' errata.
*/
-#define LOCAL_TIMER_VECTOR 0xef
+#define LOCAL_TIMER_VECTOR 0xef
+
+/*
+ * Performance monitoring interrupt vector:
+ */
+#define LOCAL_PERF_VECTOR 0xee
/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
*/
-#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
-
-#define NR_VECTORS 256
+#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
-#define FPU_IRQ 13
+#define NR_VECTORS 256
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+#define FPU_IRQ 13
-#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
-# if NR_CPUS < MAX_IO_APICS
-# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
-# else
-# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
-# endif
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
-#elif defined(CONFIG_X86_VOYAGER)
+#ifndef __ASSEMBLY__
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < 3 || irq > 15;
+}
+#endif
-# define NR_IRQS 224
+/*
+ * Size the maximum number of interrupts.
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
-#else /* IO_APIC || VOYAGER */
+#define NR_IRQS_LEGACY 16
-# define NR_IRQS 16
+#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
+#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
+#ifdef CONFIG_X86_IO_APIC
+# ifdef CONFIG_SPARSE_IRQ
+# define NR_IRQS \
+ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+# else
+# if NR_CPUS < MAX_IO_APICS
+# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
+# else
+# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+# endif
+# endif
+#else /* !CONFIG_X86_IO_APIC: */
+# define NR_IRQS NR_IRQS_LEGACY
#endif
-/* Voyager specific defines */
-/* These define the CPIs we use in linux */
-#define VIC_CPI_LEVEL0 0
-#define VIC_CPI_LEVEL1 1
-/* now the fake CPIs */
-#define VIC_TIMER_CPI 2
-#define VIC_INVALIDATE_CPI 3
-#define VIC_RESCHEDULE_CPI 4
-#define VIC_ENABLE_IRQ_CPI 5
-#define VIC_CALL_FUNCTION_CPI 6
-#define VIC_CALL_FUNCTION_SINGLE_CPI 7
-
-/* Now the QIC CPIs: Since we don't need the two initial levels,
- * these are 2 less than the VIC CPIs */
-#define QIC_CPI_OFFSET 1
-#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
-#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
-#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
-#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
-#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
-#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
-
-#define VIC_START_FAKE_CPI VIC_TIMER_CPI
-#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
-
-/* this is the SYS_INT CPI. */
-#define VIC_SYS_INT 8
-#define VIC_CMN_INT 15
-
-/* This is the boot CPI for alternate processors. It gets overwritten
- * by the above once the system has activated all available processors */
-#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
-#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
-
-
#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index b95162af0bf..d2e3bf3608a 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -6,7 +6,7 @@
*
*/
-#include <asm/types.h>
+#include <linux/types.h>
#include <linux/ioctl.h>
/* Architectural interrupt line count. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa..730843d1d2f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
+#include <asm/mtrr.h>
#define KVM_MAX_VCPUS 16
#define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct list_head oos_link;
+
/*
* The following two entries are used to key the shadow page in the
* hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- unsigned long slot_bitmap; /* One bit set per slot which has memory
- * in this shadow page.
- */
+ /*
+ * One bit set per slot which has memory
+ * in this shadow page.
+ */
+ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
bool unsync;
- bool unsync_children;
+ bool global;
+ unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
bool nmi_pending;
bool nmi_injected;
+ bool nmi_window_open;
- u64 mtrr[0x100];
+ struct mtrr_state_type mtrr_state;
+ u32 pat;
};
struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
*/
struct list_head active_mmu_pages;
struct list_head assigned_dev_head;
- struct dmar_domain *intel_iommu_domain;
+ struct list_head oos_global_pages;
+ struct iommu_domain *iommu_domain;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
+ int vapics_in_nmi_mode;
int round_robin_prev_vcpu;
unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 mmu_unsync;
+ u32 mmu_unsync_global;
u32 remote_tlb_flush;
u32 lpages;
};
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
+ u32 request_nmi_exits;
u32 irq_exits;
u32 host_state_reload;
u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation_fail;
u32 hypercalls;
u32 irq_injections;
+ u32 nmi_injections;
};
struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
+ int (*get_mt_mask_shift)(void);
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes);
+ const u8 *new, int bytes,
+ bool guest_initiated);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
#define MSR_IA32_TIME_STAMP_COUNTER 0x010
#define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f20..6a159732881 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
u8 ad_bytes;
u8 rex_prefix;
struct operand src;
+ struct operand src2;
struct operand dst;
bool has_seg_override;
u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
- /* Linear faulting address (if emulating a page-faulting instruction) */
unsigned long eflags;
-
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
-
u32 cs_base;
/* decode cache */
-
struct decode_cache decode;
};
/* Repeat String Operation Prefix */
-#define REPE_PREFIX 1
-#define REPNE_PREFIX 2
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
/* Host execution mode. */
-#if defined(__i386__)
+#if defined(CONFIG_X86_32)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
#elif defined(CONFIG_X86_64)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index d28a507cef3..1caf57628b9 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -15,7 +15,7 @@
#define SHARED_SWITCHER_PAGES \
DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
/* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
/* We map at -4M for ease of mapping into the guest (one PTE page). */
#define SWITCHER_ADDR 0xFFC00000
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index f61ee8f937e..5d98d0b68ff 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -57,5 +57,65 @@
#define __ALIGN_STR ".align 16,0x90"
#endif
+/*
+ * Assembly-time checks that ENTRY_X86/END_X86 and
+ * KPROBE_ENTRY_X86/KPROBE_END_X86 are used in balanced pairs:
+ * no missing END, no nesting, no mixing of the two kinds.
+ */
+#define __set_entry_x86 .set ENTRY_X86_IN, 0
+#define __unset_entry_x86 .set ENTRY_X86_IN, 1
+#define __set_kprobe_x86 .set KPROBE_X86_IN, 0
+#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1
+
+#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed"
+
+#define __check_entry_x86 \
+ .ifdef ENTRY_X86_IN; \
+ .ifeq ENTRY_X86_IN; \
+ __macro_err_x86; \
+ .abort; \
+ .endif; \
+ .endif
+
+#define __check_kprobe_x86 \
+ .ifdef KPROBE_X86_IN; \
+ .ifeq KPROBE_X86_IN; \
+ __macro_err_x86; \
+ .abort; \
+ .endif; \
+ .endif
+
+#define __check_entry_kprobe_x86 \
+ __check_entry_x86; \
+ __check_kprobe_x86
+
+#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86
+
+#define ENTRY_X86(name) \
+ __check_entry_kprobe_x86; \
+ __set_entry_x86; \
+ .globl name; \
+ __ALIGN; \
+ name:
+
+#define END_X86(name) \
+ __unset_entry_x86; \
+ __check_entry_kprobe_x86; \
+ .size name, .-name
+
+#define KPROBE_ENTRY_X86(name) \
+ __check_entry_kprobe_x86; \
+ __set_kprobe_x86; \
+ .pushsection .kprobes.text, "ax"; \
+ .globl name; \
+ __ALIGN; \
+ name:
+
+#define KPROBE_END_X86(name) \
+ __unset_kprobe_x86; \
+ __check_entry_kprobe_x86; \
+ .size name, .-name; \
+ .popsection
+
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
deleted file mode 100644
index ff3a6c236c0..00000000000
--- a/arch/x86/include/asm/mach-default/mach_apic.h
+++ /dev/null
@@ -1,156 +0,0 @@
-#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H
-#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-#include <mach_apicdef.h>
-#include <asm/smp.h>
-
-#define APIC_DFR_VALUE (APIC_DFR_FLAT)
-
-static inline cpumask_t target_cpus(void)
-{
-#ifdef CONFIG_SMP
- return cpu_online_map;
-#else
- return cpumask_of_cpu(0);
-#endif
-}
-
-#define NO_BALANCE_IRQ (0)
-#define esr_disable (0)
-
-#ifdef CONFIG_X86_64
-#include <asm/genapic.h>
-#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
-#define INT_DEST_MODE (genapic->int_dest_mode)
-#define TARGET_CPUS (genapic->target_cpus())
-#define apic_id_registered (genapic->apic_id_registered)
-#define init_apic_ldr (genapic->init_apic_ldr)
-#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
-#define phys_pkg_id (genapic->phys_pkg_id)
-#define vector_allocation_domain (genapic->vector_allocation_domain)
-#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
-#define send_IPI_self (genapic->send_IPI_self)
-extern void setup_apic_routing(void);
-#else
-#define INT_DELIVERY_MODE dest_LowestPrio
-#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
-#define TARGET_CPUS (target_cpus())
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static inline void init_apic_ldr(void)
-{
- unsigned long val;
-
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
- apic_write(APIC_LDR, val);
-}
-
-static inline int apic_id_registered(void)
-{
- return physid_isset(read_apic_id(), phys_cpu_present_map);
-}
-
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
- return cpus_addr(cpumask)[0];
-}
-
-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static inline void setup_apic_routing(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
- "Flat", nr_ioapics);
-#endif
-}
-
-static inline int apicid_to_node(int logical_apicid)
-{
-#ifdef CONFIG_SMP
- return apicid_2_node[hard_smp_processor_id()];
-#else
- return 0;
-#endif
-}
-
-static inline cpumask_t vector_allocation_domain(int cpu)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
- return domain;
-}
-#endif
-
-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return physid_isset(apicid, bitmap);
-}
-
-static inline unsigned long check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
-{
- return phys_map;
-}
-
-static inline int multi_timer_check(int apic, int irq)
-{
- return 0;
-}
-
-/* Mapping from cpu number to logical apicid */
-static inline int cpu_to_logical_apicid(int cpu)
-{
- return 1 << cpu;
-}
-
-static inline int cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
-static inline void setup_portio_remap(void)
-{
-}
-
-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
-}
-
-static inline void enable_apic_mode(void)
-{
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h
deleted file mode 100644
index 53179936d6c..00000000000
--- a/arch/x86/include/asm/mach-default/mach_apicdef.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
-#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
-
-#include <asm/apic.h>
-
-#ifdef CONFIG_X86_64
-#define APIC_ID_MASK (genapic->apic_id_mask)
-#define GET_APIC_ID(x) (genapic->get_apic_id(x))
-#define SET_APIC_ID(x) (genapic->set_apic_id(x))
-#else
-#define APIC_ID_MASK (0xF<<24)
-static inline unsigned get_apic_id(unsigned long x)
-{
- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (APIC_XAPIC(ver))
- return (((x)>>24)&0xFF);
- else
- return (((x)>>24)&0xF);
-}
-
-#define GET_APIC_ID(x) get_apic_id(x)
-#endif
-
-#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
deleted file mode 100644
index fabca01ebac..00000000000
--- a/arch/x86/include/asm/mach-default/mach_ipi.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H
-#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H
-
-/* Avoid include hell */
-#define NMI_VECTOR 0x02
-
-void send_IPI_mask_bitmask(cpumask_t mask, int vector);
-void __send_IPI_shortcut(unsigned int shortcut, int vector);
-
-extern int no_broadcast;
-
-#ifdef CONFIG_X86_64
-#include <asm/genapic.h>
-#define send_IPI_mask (genapic->send_IPI_mask)
-#else
-static inline void send_IPI_mask(cpumask_t mask, int vector)
-{
- send_IPI_mask_bitmask(mask, vector);
-}
-#endif
-
-static inline void __local_send_IPI_allbutself(int vector)
-{
- if (no_broadcast || vector == NMI_VECTOR) {
- cpumask_t mask = cpu_online_map;
-
- cpu_clear(smp_processor_id(), mask);
- send_IPI_mask(mask, vector);
- } else
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
-}
-
-static inline void __local_send_IPI_all(int vector)
-{
- if (no_broadcast || vector == NMI_VECTOR)
- send_IPI_mask(cpu_online_map, vector);
- else
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
-}
-
-#ifdef CONFIG_X86_64
-#define send_IPI_allbutself (genapic->send_IPI_allbutself)
-#define send_IPI_all (genapic->send_IPI_all)
-#else
-static inline void send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then we get an APIC send
- * error if we try to broadcast, thus avoid sending IPIs in this case.
- */
- if (!(num_online_cpus() > 1))
- return;
-
- __local_send_IPI_allbutself(vector);
- return;
-}
-
-static inline void send_IPI_all(int vector)
-{
- __local_send_IPI_all(vector);
-}
-#endif
-
-#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h
deleted file mode 100644
index 8c1ea21238a..00000000000
--- a/arch/x86/include/asm/mach-default/mach_mpparse.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
-#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
-
-static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
-{
- return 0;
-}
-
-/* Hook from generic ACPI tables.c */
-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-
-
-#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h
deleted file mode 100644
index e85ede686be..00000000000
--- a/arch/x86/include/asm/mach-default/mach_mpspec.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
-#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
-
-#define MAX_IRQ_SOURCES 256
-
-#if CONFIG_BASE_SMALL == 0
-#define MAX_MP_BUSSES 256
-#else
-#define MAX_MP_BUSSES 32
-#endif
-
-#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
deleted file mode 100644
index 9d80db91e99..00000000000
--- a/arch/x86/include/asm/mach-default/mach_wakecpu.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
-#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
-
-/*
- * This file copes with machines that wakeup secondary CPUs by the
- * INIT, INIT, STARTUP sequence.
- */
-
-#define WAKE_SECONDARY_VIA_INIT
-
-#define TRAMPOLINE_LOW phys_to_virt(0x467)
-#define TRAMPOLINE_HIGH phys_to_virt(0x469)
-
-#define boot_cpu_apicid boot_cpu_physical_apicid
-
-static inline void wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
- return;
-}
-
-/* Nothing to do for most platforms, since cleared by the INIT cycle */
-static inline void smp_callin_clear_local_apic(void)
-{
-}
-
-static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
-{
-}
-
-static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
-{
-}
-
-#define inquire_remote_apic(apicid) do { \
- if (apic_verbosity >= APIC_DEBUG) \
- __inquire_remote_apic(apicid); \
- } while (0)
-
-#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h
deleted file mode 100644
index 995c45efdb3..00000000000
--- a/arch/x86/include/asm/mach-generic/gpio.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _ASM_X86_MACH_GENERIC_GPIO_H
-#define _ASM_X86_MACH_GENERIC_GPIO_H
-
-int gpio_request(unsigned gpio, const char *label);
-void gpio_free(unsigned gpio);
-int gpio_direction_input(unsigned gpio);
-int gpio_direction_output(unsigned gpio, int value);
-int gpio_get_value(unsigned gpio);
-void gpio_set_value(unsigned gpio, int value);
-int gpio_to_irq(unsigned gpio);
-int irq_to_gpio(unsigned irq);
-
-#include <asm-generic/gpio.h> /* cansleep wrappers */
-
-#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
deleted file mode 100644
index 5180bd7478f..00000000000
--- a/arch/x86/include/asm/mach-generic/mach_apic.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H
-#define _ASM_X86_MACH_GENERIC_MACH_APIC_H
-
-#include <asm/genapic.h>
-
-#define esr_disable (genapic->ESR_DISABLE)
-#define NO_BALANCE_IRQ (genapic->no_balance_irq)
-#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
-#define INT_DEST_MODE (genapic->int_dest_mode)
-#undef APIC_DEST_LOGICAL
-#define APIC_DEST_LOGICAL (genapic->apic_destination_logical)
-#define TARGET_CPUS (genapic->target_cpus())
-#define apic_id_registered (genapic->apic_id_registered)
-#define init_apic_ldr (genapic->init_apic_ldr)
-#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
-#define setup_apic_routing (genapic->setup_apic_routing)
-#define multi_timer_check (genapic->multi_timer_check)
-#define apicid_to_node (genapic->apicid_to_node)
-#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
-#define cpu_present_to_apicid (genapic->cpu_present_to_apicid)
-#define apicid_to_cpu_present (genapic->apicid_to_cpu_present)
-#define setup_portio_remap (genapic->setup_portio_remap)
-#define check_apicid_present (genapic->check_apicid_present)
-#define check_phys_apicid_present (genapic->check_phys_apicid_present)
-#define check_apicid_used (genapic->check_apicid_used)
-#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
-#define vector_allocation_domain (genapic->vector_allocation_domain)
-#define enable_apic_mode (genapic->enable_apic_mode)
-#define phys_pkg_id (genapic->phys_pkg_id)
-
-extern void generic_bigsmp_probe(void);
-
-#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h
deleted file mode 100644
index 68041f3802f..00000000000
--- a/arch/x86/include/asm/mach-generic/mach_apicdef.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
-#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
-
-#ifndef APIC_DEFINITION
-#include <asm/genapic.h>
-
-#define GET_APIC_ID (genapic->get_apic_id)
-#define APIC_ID_MASK (genapic->apic_id_mask)
-#endif
-
-#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h
deleted file mode 100644
index ffd637e3c3d..00000000000
--- a/arch/x86/include/asm/mach-generic/mach_ipi.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H
-#define _ASM_X86_MACH_GENERIC_MACH_IPI_H
-
-#include <asm/genapic.h>
-
-#define send_IPI_mask (genapic->send_IPI_mask)
-#define send_IPI_allbutself (genapic->send_IPI_allbutself)
-#define send_IPI_all (genapic->send_IPI_all)
-
-#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h
deleted file mode 100644
index 048f1d46853..00000000000
--- a/arch/x86/include/asm/mach-generic/mach_mpparse.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
-#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
-
-
-extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid);
-
-extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
-
-#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h
deleted file mode 100644
index bbab5ccfd4f..00000000000
--- a/arch/x86/include/asm/mach-generic/mach_mpspec.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
-#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
-
-#define MAX_IRQ_SOURCES 256
-
-/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
-/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
-#define MAX_MP_BUSSES 260
-
-extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid);
-#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
deleted file mode 100644
index c210ab5788b..00000000000
--- a/arch/x86/include/asm/mach-rdc321x/gpio.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
-#define _ASM_X86_MACH_RDC321X_GPIO_H
-
-#include <linux/kernel.h>
-
-extern int rdc_gpio_get_value(unsigned gpio);
-extern void rdc_gpio_set_value(unsigned gpio, int value);
-extern int rdc_gpio_direction_input(unsigned gpio);
-extern int rdc_gpio_direction_output(unsigned gpio, int value);
-extern int rdc_gpio_request(unsigned gpio, const char *label);
-extern void rdc_gpio_free(unsigned gpio);
-extern void __init rdc321x_gpio_setup(void);
-
-/* Wrappers for the arch-neutral GPIO API */
-
-static inline int gpio_request(unsigned gpio, const char *label)
-{
- return rdc_gpio_request(gpio, label);
-}
-
-static inline void gpio_free(unsigned gpio)
-{
- might_sleep();
- rdc_gpio_free(gpio);
-}
-
-static inline int gpio_direction_input(unsigned gpio)
-{
- return rdc_gpio_direction_input(gpio);
-}
-
-static inline int gpio_direction_output(unsigned gpio, int value)
-{
- return rdc_gpio_direction_output(gpio, value);
-}
-
-static inline int gpio_get_value(unsigned gpio)
-{
- return rdc_gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
- rdc_gpio_set_value(gpio, value);
-}
-
-static inline int gpio_to_irq(unsigned gpio)
-{
- return gpio;
-}
-
-static inline int irq_to_gpio(unsigned irq)
-{
- return irq;
-}
-
-/* For cansleep */
-#include <asm-generic/gpio.h>
-
-#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach_timer.h
index 853728519ae..853728519ae 100644
--- a/arch/x86/include/asm/mach-default/mach_timer.h
+++ b/arch/x86/include/asm/mach_timer.h
diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e47..f7920601e47 100644
--- a/arch/x86/include/asm/mach-default/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 5a65b107ad5..031f6266f42 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -1,31 +1,18 @@
#ifndef _ASM_X86_MATH_EMU_H
#define _ASM_X86_MATH_EMU_H
+#include <asm/ptrace.h>
+#include <asm/vm86.h>
+
/* This structure matches the layout of the data saved to the stack
following a device-not-present interrupt, part of it saved
automatically by the 80386/80486.
*/
-struct info {
+struct math_emu_info {
long ___orig_eip;
- long ___ebx;
- long ___ecx;
- long ___edx;
- long ___esi;
- long ___edi;
- long ___ebp;
- long ___eax;
- long ___ds;
- long ___es;
- long ___fs;
- long ___orig_eax;
- long ___eip;
- long ___cs;
- long ___eflags;
- long ___esp;
- long ___ss;
- long ___vm86_es; /* This and the following only in vm86 mode */
- long ___vm86_ds;
- long ___vm86_fs;
- long ___vm86_gs;
+ union {
+ struct pt_regs *regs;
+ struct kernel_vm86_regs *vm86;
+ };
};
#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1d6e17c2f23..32c6e17b960 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -3,8 +3,8 @@
#ifdef __x86_64__
+#include <linux/types.h>
#include <asm/ioctls.h>
-#include <asm/types.h>
/*
* Machine Check support for x86
@@ -115,8 +115,6 @@ extern int mce_notify_user(void);
#endif /* !CONFIG_X86_32 */
-
-
#ifdef CONFIG_X86_MCE
extern void mcheck_init(struct cpuinfo_x86 *c);
#else
@@ -126,5 +124,4 @@ extern void stop_mce(void);
extern void restart_mce(void);
#endif /* __KERNEL__ */
-
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8aeeb3fd73d..f923203dc39 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -21,11 +21,54 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
-#ifdef CONFIG_X86_32
-# include "mmu_context_32.h"
-#else
-# include "mmu_context_64.h"
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#ifdef CONFIG_SMP
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#ifdef CONFIG_SMP
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* Re-load page tables */
+ load_cr3(next->pgd);
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_LDT_nolock(&next->context);
+ }
+#ifdef CONFIG_SMP
+ else {
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure we do not use freed page tables.
+ */
+ load_cr3(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
#define activate_mm(prev, next) \
do { \
@@ -33,5 +76,17 @@ do { \
switch_mm((prev), (next), NULL); \
} while (0);
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm) \
+do { \
+ lazy_load_gs(0); \
+} while (0)
+#else
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ loadsegment(fs, 0); \
+} while (0)
+#endif
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
deleted file mode 100644
index 8e10015781f..00000000000
--- a/arch/x86/include/asm/mmu_context_32.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_32_H
-#define _ASM_X86_MMU_CONTEXT_32_H
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#ifdef CONFIG_SMP
- unsigned cpu = smp_processor_id();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
-#endif
-}
-
-static inline void switch_mm(struct mm_struct *prev,
- struct mm_struct *next,
- struct task_struct *tsk)
-{
- int cpu = smp_processor_id();
-
- if (likely(prev != next)) {
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#ifdef CONFIG_SMP
- per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
- per_cpu(cpu_tlbstate, cpu).active_mm = next;
-#endif
- cpu_set(cpu, next->cpu_vm_mask);
-
- /* Re-load page tables */
- load_cr3(next->pgd);
-
- /*
- * load the LDT, if the LDT is different:
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_LDT_nolock(&next->context);
- }
-#ifdef CONFIG_SMP
- else {
- per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
- BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
-
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload %cr3.
- */
- load_cr3(next->pgd);
- load_LDT_nolock(&next->context);
- }
- }
-#endif
-}
-
-#define deactivate_mm(tsk, mm) \
- asm("movl %0,%%gs": :"r" (0));
-
-#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
deleted file mode 100644
index 677d36e9540..00000000000
--- a/arch/x86/include/asm/mmu_context_64.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_64_H
-#define _ASM_X86_MMU_CONTEXT_64_H
-
-#include <asm/pda.h>
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#ifdef CONFIG_SMP
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
-#endif
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
- if (likely(prev != next)) {
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#ifdef CONFIG_SMP
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
-#endif
- cpu_set(cpu, next->cpu_vm_mask);
- load_cr3(next->pgd);
-
- if (unlikely(next->context.ldt != prev->context.ldt))
- load_LDT_nolock(&next->context);
- }
-#ifdef CONFIG_SMP
- else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
- BUG();
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- */
- load_cr3(next->pgd);
- load_LDT_nolock(&next->context);
- }
- }
-#endif
-}
-
-#define deactivate_mm(tsk, mm) \
-do { \
- load_gs_index(0); \
- asm volatile("movl %0,%%fs"::"r"(0)); \
-} while (0)
-
-#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 485bdf059ff..07f1af494ca 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -34,10 +34,14 @@ static inline void get_memcfg_numa(void)
extern int early_pfn_to_nid(unsigned long pfn);
+extern void resume_map_numa_kva(pgd_t *pgd);
+
#else /* !CONFIG_NUMA */
#define get_memcfg_numa get_memcfg_numa_flat
+static inline void resume_map_numa_kva(pgd_t *pgd) {}
+
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 91885c28f66..5916c8df09d 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,13 +6,24 @@
#include <asm/mpspec_def.h>
extern int apic_version[MAX_APICS];
+extern int pic_mode;
#ifdef CONFIG_X86_32
-#include <mach_mpspec.h>
+
+/*
+ * Summit or generic (i.e. installer) kernels need lots of bus entries.
+ * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets.
+ */
+#if CONFIG_BASE_SMALL == 0
+# define MAX_MP_BUSSES 260
+#else
+# define MAX_MP_BUSSES 32
+#endif
+
+#define MAX_IRQ_SOURCES 256
extern unsigned int def_to_bigsmp;
extern u8 apicid_2_node[];
-extern int pic_mode;
#ifdef CONFIG_X86_NUMAQ
extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -20,15 +31,15 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
#endif
-#define MAX_APICID 256
+#define MAX_APICID 256
-#else
+#else /* CONFIG_X86_64: */
-#define MAX_MP_BUSSES 256
+#define MAX_MP_BUSSES 256
/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
-#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
-#endif
+#endif /* CONFIG_X86_64 */
extern void early_find_smp_config(void);
extern void early_get_smp_config(void);
@@ -45,11 +56,13 @@ extern int smp_found_config;
extern int mpc_default_type;
extern unsigned long mp_lapic_addr;
-extern void find_smp_config(void);
extern void get_smp_config(void);
+
#ifdef CONFIG_X86_MPPARSE
+extern void find_smp_config(void);
extern void early_reserve_e820_mpc_new(void);
#else
+static inline void find_smp_config(void) { }
static inline void early_reserve_e820_mpc_new(void) { }
#endif
@@ -60,9 +73,12 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
u32 gsi);
extern void mp_config_acpi_legacy_irqs(void);
extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+extern int acpi_probe_gsi(void);
#ifdef CONFIG_X86_IO_APIC
extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
u32 gsi, int triggering, int polarity);
+extern int mp_find_ioapic(int gsi);
+extern int mp_find_ioapic_pin(int ioapic, int gsi);
#else
static inline int
mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
@@ -71,6 +87,11 @@ mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
return 0;
}
#endif
+#else /* !CONFIG_ACPI: */
+static inline int acpi_probe_gsi(void)
+{
+ return 0;
+}
#endif /* CONFIG_ACPI */
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
@@ -142,4 +163,10 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
extern physid_mask_t phys_cpu_present_map;
+extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
+
+extern int default_acpi_madt_oem_check(char *, char *);
+
+extern void numaq_mps_oem_check(struct mpc_table *, char *, char *);
+
#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index e3ace7d1d35..4a7f96d7c18 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -24,32 +24,33 @@
# endif
#endif
-struct intel_mp_floating {
- char mpf_signature[4]; /* "_MP_" */
- unsigned int mpf_physptr; /* Configuration table address */
- unsigned char mpf_length; /* Our length (paragraphs) */
- unsigned char mpf_specification;/* Specification version */
- unsigned char mpf_checksum; /* Checksum (makes sum 0) */
- unsigned char mpf_feature1; /* Standard or configuration ? */
- unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
- unsigned char mpf_feature3; /* Unused (0) */
- unsigned char mpf_feature4; /* Unused (0) */
- unsigned char mpf_feature5; /* Unused (0) */
+/* Intel MP Floating Pointer Structure */
+struct mpf_intel {
+ char signature[4]; /* "_MP_" */
+ unsigned int physptr; /* Configuration table address */
+ unsigned char length; /* Our length (paragraphs) */
+ unsigned char specification; /* Specification version */
+ unsigned char checksum; /* Checksum (makes sum 0) */
+ unsigned char feature1; /* Standard or configuration ? */
+ unsigned char feature2; /* Bit7 set for IMCR|PIC */
+ unsigned char feature3; /* Unused (0) */
+ unsigned char feature4; /* Unused (0) */
+ unsigned char feature5; /* Unused (0) */
};
#define MPC_SIGNATURE "PCMP"
-struct mp_config_table {
- char mpc_signature[4];
- unsigned short mpc_length; /* Size of table */
- char mpc_spec; /* 0x01 */
- char mpc_checksum;
- char mpc_oem[8];
- char mpc_productid[12];
- unsigned int mpc_oemptr; /* 0 if not present */
- unsigned short mpc_oemsize; /* 0 if not present */
- unsigned short mpc_oemcount;
- unsigned int mpc_lapic; /* APIC address */
+struct mpc_table {
+ char signature[4];
+ unsigned short length; /* Size of table */
+ char spec; /* 0x01 */
+ char checksum;
+ char oem[8];
+ char productid[12];
+ unsigned int oemptr; /* 0 if not present */
+ unsigned short oemsize; /* 0 if not present */
+ unsigned short oemcount;
+ unsigned int lapic; /* APIC address */
unsigned int reserved;
};
@@ -70,20 +71,20 @@ struct mp_config_table {
#define CPU_MODEL_MASK 0x00F0
#define CPU_FAMILY_MASK 0x0F00
-struct mpc_config_processor {
- unsigned char mpc_type;
- unsigned char mpc_apicid; /* Local APIC number */
- unsigned char mpc_apicver; /* Its versions */
- unsigned char mpc_cpuflag;
- unsigned int mpc_cpufeature;
- unsigned int mpc_featureflag; /* CPUID feature value */
- unsigned int mpc_reserved[2];
+struct mpc_cpu {
+ unsigned char type;
+ unsigned char apicid; /* Local APIC number */
+ unsigned char apicver; /* Its versions */
+ unsigned char cpuflag;
+ unsigned int cpufeature;
+ unsigned int featureflag; /* CPUID feature value */
+ unsigned int reserved[2];
};
-struct mpc_config_bus {
- unsigned char mpc_type;
- unsigned char mpc_busid;
- unsigned char mpc_bustype[6];
+struct mpc_bus {
+ unsigned char type;
+ unsigned char busid;
+ unsigned char bustype[6];
};
/* List of Bus Type string values, Intel MP Spec. */
@@ -108,22 +109,22 @@ struct mpc_config_bus {
#define MPC_APIC_USABLE 0x01
-struct mpc_config_ioapic {
- unsigned char mpc_type;
- unsigned char mpc_apicid;
- unsigned char mpc_apicver;
- unsigned char mpc_flags;
- unsigned int mpc_apicaddr;
+struct mpc_ioapic {
+ unsigned char type;
+ unsigned char apicid;
+ unsigned char apicver;
+ unsigned char flags;
+ unsigned int apicaddr;
};
-struct mpc_config_intsrc {
- unsigned char mpc_type;
- unsigned char mpc_irqtype;
- unsigned short mpc_irqflag;
- unsigned char mpc_srcbus;
- unsigned char mpc_srcbusirq;
- unsigned char mpc_dstapic;
- unsigned char mpc_dstirq;
+struct mpc_intsrc {
+ unsigned char type;
+ unsigned char irqtype;
+ unsigned short irqflag;
+ unsigned char srcbus;
+ unsigned char srcbusirq;
+ unsigned char dstapic;
+ unsigned char dstirq;
};
enum mp_irq_source_types {
@@ -139,24 +140,24 @@ enum mp_irq_source_types {
#define MP_APIC_ALL 0xFF
-struct mpc_config_lintsrc {
- unsigned char mpc_type;
- unsigned char mpc_irqtype;
- unsigned short mpc_irqflag;
- unsigned char mpc_srcbusid;
- unsigned char mpc_srcbusirq;
- unsigned char mpc_destapic;
- unsigned char mpc_destapiclint;
+struct mpc_lintsrc {
+ unsigned char type;
+ unsigned char irqtype;
+ unsigned short irqflag;
+ unsigned char srcbusid;
+ unsigned char srcbusirq;
+ unsigned char destapic;
+ unsigned char destapiclint;
};
#define MPC_OEM_SIGNATURE "_OEM"
-struct mp_config_oemtable {
- char oem_signature[4];
- unsigned short oem_length; /* Size of table */
- char oem_rev; /* 0x01 */
- char oem_checksum;
- char mpc_oem[8];
+struct mpc_oemtable {
+ char signature[4];
+ unsigned short length; /* Size of table */
+ char rev; /* 0x01 */
+ char checksum;
+ char mpc[8];
};
/*
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e38859d577a..358acc59ae0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -85,7 +85,9 @@
/* AMD64 MSRs. Not complete. See the architecture manual for a more
complete list. */
+#define MSR_AMD64_PATCH_LEVEL 0x0000008b
#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
@@ -200,6 +202,35 @@
#define MSR_IA32_THERM_STATUS 0x0000019c
#define MSR_IA32_MISC_ENABLE 0x000001a0
+/* MISC_ENABLE bits: architectural */
+#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
+#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
+#define MSR_IA32_MISC_ENABLE_EMON (1ULL << 7)
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << 11)
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << 12)
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << 16)
+#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << 18)
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << 22)
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << 23)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << 34)
+
+/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << 2)
+#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << 3)
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << 4)
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << 6)
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << 8)
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << 9)
+#define MSR_IA32_MISC_ENABLE_FERR (1ULL << 10)
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << 10)
+#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << 13)
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << 19)
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << 20)
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << 24)
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << 37)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
+
/* Intel Model 6 */
#define MSR_P6_EVNTSEL0 0x00000186
#define MSR_P6_EVNTSEL1 0x00000187
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c2a812ebde8..638bf624180 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
}
/*
- * i386 calling convention returns 64-bit value in edx:eax, while
- * x86_64 returns at rax. Also, the "A" constraint does not really
- * mean rdx:rax in x86_64, so we need specialized behaviour for each
- * architecture
+ * Both i386 and x86_64 return a 64-bit value in edx:eax, but gcc's "A"
+ * constraint has different meanings. For i386, "A" means exactly
+ * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
+ * it means rax *or* rdx.
*/
#ifdef CONFIG_X86_64
#define DECLARE_ARGS(val, low, high) unsigned low, high
@@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr,
asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
}
-static inline int native_write_msr_safe(unsigned int msr,
+/* Can be uninlined because referenced by paravirt */
+notrace static inline int native_write_msr_safe(unsigned int msr,
unsigned low, unsigned high)
{
int err;
@@ -181,10 +182,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
}
#define rdtscl(low) \
- ((low) = (u32)native_read_tsc())
+ ((low) = (u32)__native_read_tsc())
#define rdtscll(val) \
- ((val) = native_read_tsc())
+ ((val) = __native_read_tsc())
#define rdpmc(counter, low, high) \
do { \
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31..a51ada8467d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -23,6 +23,7 @@
#ifndef _ASM_X86_MTRR_H
#define _ASM_X86_MTRR_H
+#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
@@ -57,6 +58,31 @@ struct mtrr_gentry {
};
#endif /* !__i386__ */
+struct mtrr_var_range {
+ __u32 base_lo;
+ __u32 base_hi;
+ __u32 mask_lo;
+ __u32 mask_hi;
+};
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+ an 8-bit field: */
+typedef __u8 mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+struct mtrr_state_type {
+ struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+ mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+ unsigned char enabled;
+ unsigned char have_fixed;
+ mtrr_type def_type;
+};
+
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+
/* These are the various ioctls */
#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 1e8bd30b4c1..9f0a5f5d29e 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -31,6 +31,8 @@
extern int found_numaq;
extern int get_memcfg_numaq(void);
+extern void *xquad_portio;
+
/*
* SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
*/
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
deleted file mode 100644
index 0bf2a06b7a4..00000000000
--- a/arch/x86/include/asm/numaq/apic.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef __ASM_NUMAQ_APIC_H
-#define __ASM_NUMAQ_APIC_H
-
-#include <asm/io.h>
-#include <linux/mmzone.h>
-#include <linux/nodemask.h>
-
-#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static inline cpumask_t target_cpus(void)
-{
- return CPU_MASK_ALL;
-}
-
-#define NO_BALANCE_IRQ (1)
-#define esr_disable (1)
-
-#define INT_DELIVERY_MODE dest_LowestPrio
-#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */
-
-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return physid_isset(apicid, bitmap);
-}
-static inline unsigned long check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-#define apicid_cluster(apicid) (apicid & 0xF0)
-
-static inline int apic_id_registered(void)
-{
- return 1;
-}
-
-static inline void init_apic_ldr(void)
-{
- /* Already done in NUMA-Q firmware */
-}
-
-static inline void setup_apic_routing(void)
-{
- printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
- "NUMA-Q", nr_ioapics);
-}
-
-/*
- * Skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum.
- */
-static inline int multi_timer_check(int apic, int irq)
-{
- return apic != 0 && irq == 0;
-}
-
-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
-{
- /* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL);
-}
-
-/* Mapping from cpu number to logical apicid */
-extern u8 cpu_2_logical_apicid[];
-static inline int cpu_to_logical_apicid(int cpu)
-{
- if (cpu >= NR_CPUS)
- return BAD_APICID;
- return (int)cpu_2_logical_apicid[cpu];
-}
-
-/*
- * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
- * cpu to APIC ID relation to properly interact with the intelligent
- * mode of the cluster controller.
- */
-static inline int cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < 60)
- return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
- else
- return BAD_APICID;
-}
-
-static inline int apicid_to_node(int logical_apicid)
-{
- return logical_apicid >> 4;
-}
-
-static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
-{
- int node = apicid_to_node(logical_apicid);
- int cpu = __ffs(logical_apicid & 0xf);
-
- return physid_mask_of_physid(cpu + 4*node);
-}
-
-extern void *xquad_portio;
-
-static inline void setup_portio_remap(void)
-{
- int num_quads = num_online_nodes();
-
- if (num_quads <= 1)
- return;
-
- printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
- xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
- printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
- (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
-}
-
-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return (1);
-}
-
-static inline void enable_apic_mode(void)
-{
-}
-
-/*
- * We use physical apicids here, not logical, so just return the default
- * physical broadcast to stop people from breaking us
- */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
- return (int) 0xF;
-}
-
-/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-#endif /* __ASM_NUMAQ_APIC_H */
diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h
deleted file mode 100644
index e012a46cc22..00000000000
--- a/arch/x86/include/asm/numaq/apicdef.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __ASM_NUMAQ_APICDEF_H
-#define __ASM_NUMAQ_APICDEF_H
-
-
-#define APIC_ID_MASK (0xF<<24)
-
-static inline unsigned get_apic_id(unsigned long x)
-{
- return (((x)>>24)&0x0F);
-}
-
-#define GET_APIC_ID(x) get_apic_id(x)
-
-#endif
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
deleted file mode 100644
index 935588d286c..00000000000
--- a/arch/x86/include/asm/numaq/ipi.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __ASM_NUMAQ_IPI_H
-#define __ASM_NUMAQ_IPI_H
-
-void send_IPI_mask_sequence(cpumask_t, int vector);
-
-static inline void send_IPI_mask(cpumask_t mask, int vector)
-{
- send_IPI_mask_sequence(mask, vector);
-}
-
-static inline void send_IPI_allbutself(int vector)
-{
- cpumask_t mask = cpu_online_map;
- cpu_clear(smp_processor_id(), mask);
-
- if (!cpus_empty(mask))
- send_IPI_mask(mask, vector);
-}
-
-static inline void send_IPI_all(int vector)
-{
- send_IPI_mask(cpu_online_map, vector);
-}
-
-#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h
deleted file mode 100644
index 252292e077b..00000000000
--- a/arch/x86/include/asm/numaq/mpparse.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_NUMAQ_MPPARSE_H
-#define __ASM_NUMAQ_MPPARSE_H
-
-extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid);
-
-#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
deleted file mode 100644
index c577bda5b1c..00000000000
--- a/arch/x86/include/asm/numaq/wakecpu.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef __ASM_NUMAQ_WAKECPU_H
-#define __ASM_NUMAQ_WAKECPU_H
-
-/* This file copes with machines that wakeup secondary CPUs by NMIs */
-
-#define WAKE_SECONDARY_VIA_NMI
-
-#define TRAMPOLINE_LOW phys_to_virt(0x8)
-#define TRAMPOLINE_HIGH phys_to_virt(0xa)
-
-#define boot_cpu_apicid boot_cpu_logical_apicid
-
-/* We don't do anything here because we use NMI's to boot instead */
-static inline void wait_for_init_deassert(atomic_t *deassert)
-{
-}
-
-/*
- * Because we use NMIs rather than the INIT-STARTUP sequence to
- * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
- */
-static inline void smp_callin_clear_local_apic(void)
-{
- clear_local_APIC();
-}
-
-static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
-{
- printk("Storing NMI vector\n");
- *high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
- *low = *((volatile unsigned short *) TRAMPOLINE_LOW);
-}
-
-static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
-{
- printk("Restoring NMI vector\n");
- *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high;
- *((volatile unsigned short *) TRAMPOLINE_LOW) = *low;
-}
-
-#define inquire_remote_apic(apicid) {}
-
-#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index e9873a2e869..40226999cbf 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -95,6 +95,11 @@ static inline pgdval_t native_pgd_val(pgd_t pgd)
return pgd.pgd;
}
+static inline pgdval_t pgd_flags(pgd_t pgd)
+{
+ return native_pgd_val(pgd) & PTE_FLAGS_MASK;
+}
+
#if PAGETABLE_LEVELS >= 3
#if PAGETABLE_LEVELS == 4
typedef struct { pudval_t pud; } pud_t;
@@ -117,6 +122,11 @@ static inline pudval_t native_pud_val(pud_t pud)
}
#endif /* PAGETABLE_LEVELS == 4 */
+static inline pudval_t pud_flags(pud_t pud)
+{
+ return native_pud_val(pud) & PTE_FLAGS_MASK;
+}
+
typedef struct { pmdval_t pmd; } pmd_t;
static inline pmd_t native_make_pmd(pmdval_t val)
@@ -128,6 +138,7 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
{
return pmd.pmd;
}
+
#else /* PAGETABLE_LEVELS == 2 */
#include <asm-generic/pgtable-nopmd.h>
@@ -137,6 +148,11 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
}
#endif /* PAGETABLE_LEVELS >= 3 */
+static inline pmdval_t pmd_flags(pmd_t pmd)
+{
+ return native_pmd_val(pmd) & PTE_FLAGS_MASK;
+}
+
static inline pte_t native_make_pte(pteval_t val)
{
return (pte_t) { .pte = val };
@@ -147,7 +163,7 @@ static inline pteval_t native_pte_val(pte_t pte)
return pte.pte;
}
-static inline pteval_t native_pte_flags(pte_t pte)
+static inline pteval_t pte_flags(pte_t pte)
{
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
@@ -173,7 +189,6 @@ static inline pteval_t native_pte_flags(pte_t pte)
#endif
#define pte_val(x) native_pte_val(x)
-#define pte_flags(x) native_pte_flags(x)
#define __pte(x) native_make_pte(x)
#endif /* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 5ebca29f44f..e27fdbe5f9e 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -13,8 +13,8 @@
#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
-#define IRQSTACK_ORDER 2
-#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
+#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
#define STACKFAULT_STACK 1
#define DOUBLEFAULT_STACK 2
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aed..1c244b64573 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -12,21 +12,38 @@
#define CLBR_EAX (1 << 0)
#define CLBR_ECX (1 << 1)
#define CLBR_EDX (1 << 2)
+#define CLBR_EDI (1 << 3)
-#ifdef CONFIG_X86_64
-#define CLBR_RSI (1 << 3)
-#define CLBR_RDI (1 << 4)
+#ifdef CONFIG_X86_32
+/* CLBR_ANY should match all regs platform has. For i386, that's just it */
+#define CLBR_ANY ((1 << 4) - 1)
+
+#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
+#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
+#define CLBR_SCRATCH (0)
+#else
+#define CLBR_RAX CLBR_EAX
+#define CLBR_RCX CLBR_ECX
+#define CLBR_RDX CLBR_EDX
+#define CLBR_RDI CLBR_EDI
+#define CLBR_RSI (1 << 4)
#define CLBR_R8 (1 << 5)
#define CLBR_R9 (1 << 6)
#define CLBR_R10 (1 << 7)
#define CLBR_R11 (1 << 8)
+
#define CLBR_ANY ((1 << 9) - 1)
+
+#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
+ CLBR_RCX | CLBR_R8 | CLBR_R9)
+#define CLBR_RET_REG (CLBR_RAX)
+#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
+
#include <asm/desc_defs.h>
-#else
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY ((1 << 3) - 1)
#endif /* X86_64 */
+#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) /* every argument and scratch register except the return-value register(s) */
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/cpumask.h>
@@ -40,6 +57,14 @@ struct tss_struct;
struct mm_struct;
struct desc_struct;
+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention. See PV_CALLEE_SAVE_REGS_THUNK below.
+ */
+struct paravirt_callee_save {
+ void *func;
+};
+
/* general info */
struct pv_info {
unsigned int kernel_rpl;
@@ -189,11 +214,15 @@ struct pv_irq_ops {
* expected to use X86_EFLAGS_IF; all other bits
* returned from save_fl are undefined, and may be ignored by
* restore_fl.
+ *
+ * NOTE: Callers of these functions expect the callee to preserve
+ * more registers than the standard C calling convention.
*/
- unsigned long (*save_fl)(void);
- void (*restore_fl)(unsigned long);
- void (*irq_disable)(void);
- void (*irq_enable)(void);
+ struct paravirt_callee_save save_fl;
+ struct paravirt_callee_save restore_fl;
+ struct paravirt_callee_save irq_disable;
+ struct paravirt_callee_save irq_enable;
+
void (*safe_halt)(void);
void (*halt)(void);
@@ -244,7 +273,8 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
- void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
+ void (*flush_tlb_others)(const struct cpumask *cpus,
+ struct mm_struct *mm,
unsigned long va);
/* Hooks for allocating and freeing a pagetable top-level */
@@ -278,12 +308,11 @@ struct pv_mmu_ops {
void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
- pteval_t (*pte_val)(pte_t);
- pteval_t (*pte_flags)(pte_t);
- pte_t (*make_pte)(pteval_t pte);
+ struct paravirt_callee_save pte_val;
+ struct paravirt_callee_save make_pte;
- pgdval_t (*pgd_val)(pgd_t);
- pgd_t (*make_pgd)(pgdval_t pgd);
+ struct paravirt_callee_save pgd_val;
+ struct paravirt_callee_save make_pgd;
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
@@ -298,12 +327,12 @@ struct pv_mmu_ops {
void (*set_pud)(pud_t *pudp, pud_t pudval);
- pmdval_t (*pmd_val)(pmd_t);
- pmd_t (*make_pmd)(pmdval_t pmd);
+ struct paravirt_callee_save pmd_val;
+ struct paravirt_callee_save make_pmd;
#if PAGETABLE_LEVELS == 4
- pudval_t (*pud_val)(pud_t);
- pud_t (*make_pud)(pudval_t pud);
+ struct paravirt_callee_save pud_val;
+ struct paravirt_callee_save make_pud;
void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
#endif /* PAGETABLE_LEVELS == 4 */
@@ -388,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
const void *target, u16 tgt_clobbers,
@@ -479,25 +510,45 @@ int paravirt_disable_iospace(void);
* makes sure the incoming and outgoing types are always correct.
*/
#ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx
+#define PVOP_VCALL_ARGS \
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
+
+#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
+
#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
"=c" (__ecx)
#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
+
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
+#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
+
#define EXTRA_CLOBBERS
#define VEXTRA_CLOBBERS
-#else
-#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx
+#else /* CONFIG_X86_64 */
+#define PVOP_VCALL_ARGS \
+ unsigned long __edi = __edi, __esi = __esi, \
+ __edx = __edx, __ecx = __ecx
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
+
+#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
+
#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
"=S" (__esi), "=d" (__edx), \
"=c" (__ecx)
-
#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
+#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
+
#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
-#endif
+#endif /* CONFIG_X86_32 */
#ifdef CONFIG_PARAVIRT_DEBUG
#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
@@ -505,10 +556,11 @@ int paravirt_disable_iospace(void);
#define PVOP_TEST_NULL(op) ((void)op)
#endif
-#define __PVOP_CALL(rettype, op, pre, post, ...) \
+#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
+ pre, post, ...) \
({ \
rettype __ret; \
- PVOP_CALL_ARGS; \
+ PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
/* This is 32-bit specific, but is okay in 64-bit */ \
/* since this condition will never hold */ \
@@ -516,70 +568,113 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : PVOP_CALL_CLOBBERS \
+ : call_clbr \
: paravirt_type(op), \
- paravirt_clobber(CLBR_ANY), \
+ paravirt_clobber(clbr), \
##__VA_ARGS__ \
- : "memory", "cc" EXTRA_CLOBBERS); \
+ : "memory", "cc" extra_clbr); \
__ret = (rettype)((((u64)__edx) << 32) | __eax); \
} else { \
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : PVOP_CALL_CLOBBERS \
+ : call_clbr \
: paravirt_type(op), \
- paravirt_clobber(CLBR_ANY), \
+ paravirt_clobber(clbr), \
##__VA_ARGS__ \
- : "memory", "cc" EXTRA_CLOBBERS); \
+ : "memory", "cc" extra_clbr); \
__ret = (rettype)__eax; \
} \
__ret; \
})
-#define __PVOP_VCALL(op, pre, post, ...) \
+
+#define __PVOP_CALL(rettype, op, pre, post, ...) \
+ ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
+ EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
+ ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
+ PVOP_CALLEE_CLOBBERS, , \
+ pre, post, ##__VA_ARGS__)
+
+
+#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
({ \
PVOP_VCALL_ARGS; \
PVOP_TEST_NULL(op); \
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : PVOP_VCALL_CLOBBERS \
+ : call_clbr \
: paravirt_type(op), \
- paravirt_clobber(CLBR_ANY), \
+ paravirt_clobber(clbr), \
##__VA_ARGS__ \
- : "memory", "cc" VEXTRA_CLOBBERS); \
+ : "memory", "cc" extra_clbr); \
})
+#define __PVOP_VCALL(op, pre, post, ...) \
+ ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ VEXTRA_CLOBBERS, \
+ pre, post, ##__VA_ARGS__)
+
+/*
+ * Void counterpart of __PVOP_CALLEESAVE: call op.func using the
+ * callee-save convention and discard whatever comes back in eax/rax.
+ *
+ * Takes no rettype, matching how PVOP_VCALLEE0/1/2 invoke it. It still
+ * goes through ____PVOP_CALL (with a dummy unsigned long return type)
+ * rather than ____PVOP_VCALL, because PVOP_VCALLEE_CLOBBERS names __eax
+ * and only PVOP_CALL_ARGS declares __eax on x86_64.
+ */
+#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
+ ((void)____PVOP_CALL(unsigned long, op.func, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , \
+ pre, post, ##__VA_ARGS__))
+
+
+
#define PVOP_CALL0(rettype, op) \
__PVOP_CALL(rettype, op, "", "")
#define PVOP_VCALL0(op) \
__PVOP_VCALL(op, "", "")
+#define PVOP_CALLEE0(rettype, op) \
+ __PVOP_CALLEESAVE(rettype, op, "", "")
+#define PVOP_VCALLEE0(op) \
+ __PVOP_VCALLEESAVE(op, "", "")
+
+
#define PVOP_CALL1(rettype, op, arg1) \
- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
+ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
#define PVOP_VCALL1(op, arg1) \
- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
+ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+
+#define PVOP_CALLEE1(rettype, op, arg1) \
+ __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALLEE1(op, arg1) \
+ __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+
#define PVOP_CALL2(rettype, op, arg1, arg2) \
- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
- "1" ((unsigned long)(arg2)))
+ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
#define PVOP_VCALL2(op, arg1, arg2) \
- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
- "1" ((unsigned long)(arg2)))
+ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+
+#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
+ __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALLEE2(op, arg1, arg2) \
+ __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+
#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
#define PVOP_VCALL3(op, arg1, arg2, arg3) \
- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
/* This is the only difference in x86_64. We can make it much simpler */
#ifdef CONFIG_X86_32
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
__PVOP_CALL(rettype, op, \
"push %[_arg4];", "lea 4(%%esp),%%esp;", \
- "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
- "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
__PVOP_VCALL(op, \
"push %[_arg4];", "lea 4(%%esp),%%esp;", \
@@ -587,13 +682,13 @@ int paravirt_disable_iospace(void);
"2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
#else
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
- "3"((unsigned long)(arg4)))
+ __PVOP_CALL(rettype, op, "", "", \
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
- "3"((unsigned long)(arg4)))
+ __PVOP_VCALL(op, "", "", \
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
#endif
static inline int paravirt_enabled(void)
@@ -984,10 +1079,11 @@ static inline void __flush_tlb_single(unsigned long addr)
PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}
-static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+static inline void flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm,
unsigned long va)
{
- PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
+ PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -1059,13 +1155,13 @@ static inline pte_t __pte(pteval_t val)
pteval_t ret;
if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALL2(pteval_t,
- pv_mmu_ops.make_pte,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pteval_t,
+ pv_mmu_ops.make_pte,
+ val, (u64)val >> 32);
else
- ret = PVOP_CALL1(pteval_t,
- pv_mmu_ops.make_pte,
- val);
+ ret = PVOP_CALLEE1(pteval_t,
+ pv_mmu_ops.make_pte,
+ val);
return (pte_t) { .pte = ret };
}
@@ -1075,29 +1171,12 @@ static inline pteval_t pte_val(pte_t pte)
pteval_t ret;
if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
- pte.pte, (u64)pte.pte >> 32);
- else
- ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
- pte.pte);
-
- return ret;
-}
-
-static inline pteval_t pte_flags(pte_t pte)
-{
- pteval_t ret;
-
- if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
- pte.pte, (u64)pte.pte >> 32);
+ ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte, (u64)pte.pte >> 32);
else
- ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
- pte.pte);
+ ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte);
-#ifdef CONFIG_PARAVIRT_DEBUG
- BUG_ON(ret & PTE_PFN_MASK);
-#endif
return ret;
}
@@ -1106,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val)
pgdval_t ret;
if (sizeof(pgdval_t) > sizeof(long))
- ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
+ val, (u64)val >> 32);
else
- ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
- val);
+ ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
+ val);
return (pgd_t) { ret };
}
@@ -1120,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd)
pgdval_t ret;
if (sizeof(pgdval_t) > sizeof(long))
- ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
- pgd.pgd, (u64)pgd.pgd >> 32);
+ ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd, (u64)pgd.pgd >> 32);
else
- ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
- pgd.pgd);
+ ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd);
return ret;
}
@@ -1188,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val)
pmdval_t ret;
if (sizeof(pmdval_t) > sizeof(long))
- ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
+ val, (u64)val >> 32);
else
- ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
- val);
+ ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
+ val);
return (pmd_t) { ret };
}
@@ -1202,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd)
pmdval_t ret;
if (sizeof(pmdval_t) > sizeof(long))
- ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
- pmd.pmd, (u64)pmd.pmd >> 32);
+ ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd, (u64)pmd.pmd >> 32);
else
- ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
- pmd.pmd);
+ ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd);
return ret;
}
@@ -1228,11 +1307,11 @@ static inline pud_t __pud(pudval_t val)
pudval_t ret;
if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
- val, (u64)val >> 32);
+ ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
+ val, (u64)val >> 32);
else
- ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
- val);
+ ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
+ val);
return (pud_t) { ret };
}
@@ -1242,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud)
pudval_t ret;
if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
- pud.pud, (u64)pud.pud >> 32);
+ ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud, (u64)pud.pud >> 32);
else
- ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
- pud.pud);
+ ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud);
return ret;
}
@@ -1387,9 +1466,10 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
}
void _paravirt_nop(void);
-#define paravirt_nop ((void *)_paravirt_nop)
+u32 _paravirt_ident_32(u32);
+u64 _paravirt_ident_64(u64);
-void paravirt_use_bytelocks(void);
+#define paravirt_nop ((void *)_paravirt_nop)
#ifdef CONFIG_SMP
@@ -1402,6 +1482,7 @@ static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
{
return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
}
+#define __raw_spin_is_contended __raw_spin_is_contended
static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
{
@@ -1438,12 +1519,37 @@ extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
#ifdef CONFIG_X86_32
-#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
-#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
+#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
+#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
+
#define PV_FLAGS_ARG "0"
#define PV_EXTRA_CLOBBERS
#define PV_VEXTRA_CLOBBERS
#else
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS \
+ "push %rcx;" \
+ "push %rdx;" \
+ "push %rsi;" \
+ "push %rdi;" \
+ "push %r8;" \
+ "push %r9;" \
+ "push %r10;" \
+ "push %r11;"
+#define PV_RESTORE_ALL_CALLER_REGS \
+ "pop %r11;" \
+ "pop %r10;" \
+ "pop %r9;" \
+ "pop %r8;" \
+ "pop %rdi;" \
+ "pop %rsi;" \
+ "pop %rdx;" \
+ "pop %rcx;"
+
/* We save some registers, but not all of them; that's too much. We clobber all
* caller saved registers but the argument parameter */
#define PV_SAVE_REGS "pushq %%rdi;"
@@ -1453,52 +1559,76 @@ extern struct paravirt_patch_site __parainstructions[],
#define PV_FLAGS_ARG "D"
#endif
+/*
+ * Generate a thunk around a function which saves all caller-save
+ * registers except for the return value. This allows C functions to
+ * be called from assembler code where fewer than normal registers are
+ * available. It may also help code generation around calls from C
+ * code if the common case doesn't use many registers.
+ *
+ * When a callee is wrapped in a thunk, the caller can assume that all
+ * arg regs and all scratch registers are preserved across the
+ * call. The return value in rax/eax will not be saved, even for void
+ * functions.
+ */
+#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+ extern typeof(func) __raw_callee_save_##func; \
+ static void *__##func##__ __used = func; \
+ \
+ asm(".pushsection .text;" \
+ "__raw_callee_save_" #func ": " \
+ PV_SAVE_ALL_CALLER_REGS \
+ "call " #func ";" \
+ PV_RESTORE_ALL_CALLER_REGS \
+ "ret;" \
+ ".popsection")
+
+/* Get a reference to a callee-save function */
+#define PV_CALLEE_SAVE(func) \
+ ((struct paravirt_callee_save) { __raw_callee_save_##func })
+
+/* Promise that "func" already uses the right calling convention */
+#define __PV_IS_CALLEE_SAVE(func) \
+ ((struct paravirt_callee_save) { func })
+
static inline unsigned long __raw_local_save_flags(void)
{
unsigned long f;
- asm volatile(paravirt_alt(PV_SAVE_REGS
- PARAVIRT_CALL
- PV_RESTORE_REGS)
+ asm volatile(paravirt_alt(PARAVIRT_CALL)
: "=a"(f)
: paravirt_type(pv_irq_ops.save_fl),
paravirt_clobber(CLBR_EAX)
- : "memory", "cc" PV_VEXTRA_CLOBBERS);
+ : "memory", "cc");
return f;
}
static inline void raw_local_irq_restore(unsigned long f)
{
- asm volatile(paravirt_alt(PV_SAVE_REGS
- PARAVIRT_CALL
- PV_RESTORE_REGS)
+ asm volatile(paravirt_alt(PARAVIRT_CALL)
: "=a"(f)
: PV_FLAGS_ARG(f),
paravirt_type(pv_irq_ops.restore_fl),
paravirt_clobber(CLBR_EAX)
- : "memory", "cc" PV_EXTRA_CLOBBERS);
+ : "memory", "cc");
}
static inline void raw_local_irq_disable(void)
{
- asm volatile(paravirt_alt(PV_SAVE_REGS
- PARAVIRT_CALL
- PV_RESTORE_REGS)
+ asm volatile(paravirt_alt(PARAVIRT_CALL)
:
: paravirt_type(pv_irq_ops.irq_disable),
paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+ : "memory", "eax", "cc");
}
static inline void raw_local_irq_enable(void)
{
- asm volatile(paravirt_alt(PV_SAVE_REGS
- PARAVIRT_CALL
- PV_RESTORE_REGS)
+ asm volatile(paravirt_alt(PARAVIRT_CALL)
:
: paravirt_type(pv_irq_ops.irq_enable),
paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+ : "memory", "eax", "cc");
}
static inline unsigned long __raw_local_irq_save(void)
@@ -1541,33 +1671,49 @@ static inline unsigned long __raw_local_irq_save(void)
.popsection
+#define COND_PUSH(set, mask, reg) /* push reg unless (set) already covers mask */ \
+ .if ((~(set)) & mask); push %reg; .endif
+#define COND_POP(set, mask, reg) /* pop reg unless (set) already covers mask */ \
+ .if ((~(set)) & mask); pop %reg; .endif
+
#ifdef CONFIG_X86_64
-#define PV_SAVE_REGS \
- push %rax; \
- push %rcx; \
- push %rdx; \
- push %rsi; \
- push %rdi; \
- push %r8; \
- push %r9; \
- push %r10; \
- push %r11
-#define PV_RESTORE_REGS \
- pop %r11; \
- pop %r10; \
- pop %r9; \
- pop %r8; \
- pop %rdi; \
- pop %rsi; \
- pop %rdx; \
- pop %rcx; \
- pop %rax
+
+#define PV_SAVE_REGS(set) \
+ COND_PUSH(set, CLBR_RAX, rax); \
+ COND_PUSH(set, CLBR_RCX, rcx); \
+ COND_PUSH(set, CLBR_RDX, rdx); \
+ COND_PUSH(set, CLBR_RSI, rsi); \
+ COND_PUSH(set, CLBR_RDI, rdi); \
+ COND_PUSH(set, CLBR_R8, r8); \
+ COND_PUSH(set, CLBR_R9, r9); \
+ COND_PUSH(set, CLBR_R10, r10); \
+ COND_PUSH(set, CLBR_R11, r11)
+#define PV_RESTORE_REGS(set) \
+ COND_POP(set, CLBR_R11, r11); \
+ COND_POP(set, CLBR_R10, r10); \
+ COND_POP(set, CLBR_R9, r9); \
+ COND_POP(set, CLBR_R8, r8); \
+ COND_POP(set, CLBR_RDI, rdi); \
+ COND_POP(set, CLBR_RSI, rsi); \
+ COND_POP(set, CLBR_RDX, rdx); \
+ COND_POP(set, CLBR_RCX, rcx); \
+ COND_POP(set, CLBR_RAX, rax)
+
#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
#else
-#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
-#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PV_SAVE_REGS(set) \
+ COND_PUSH(set, CLBR_EAX, eax); \
+ COND_PUSH(set, CLBR_EDI, edi); \
+ COND_PUSH(set, CLBR_ECX, ecx); \
+ COND_PUSH(set, CLBR_EDX, edx)
+#define PV_RESTORE_REGS(set) \
+ COND_POP(set, CLBR_EDX, edx); \
+ COND_POP(set, CLBR_ECX, ecx); \
+ COND_POP(set, CLBR_EDI, edi); \
+ COND_POP(set, CLBR_EAX, eax)
+
#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
#define PARA_INDIRECT(addr) *%cs:addr
@@ -1579,15 +1725,15 @@ static inline unsigned long __raw_local_irq_save(void)
#define DISABLE_INTERRUPTS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
- PV_SAVE_REGS; \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
- PV_RESTORE_REGS;) \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#define ENABLE_INTERRUPTS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
- PV_SAVE_REGS; \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
- PV_RESTORE_REGS;)
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#define USERGS_SYSRET32 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
@@ -1617,11 +1763,15 @@ static inline unsigned long __raw_local_irq_save(void)
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
swapgs)
+/*
+ * Note: swapgs is very special, and in practice is either going to be
+ * implemented with a single "swapgs" instruction or something very
+ * special. Either way, we don't need to save any registers for
+ * it.
+ */
#define SWAPGS \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
- PV_SAVE_REGS; \
- call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
- PV_RESTORE_REGS \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \
)
#define GET_CR2_INTO_RCX \
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b8493b3b989..9709fdff661 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -5,10 +5,8 @@
#ifdef CONFIG_X86_PAT
extern int pat_enabled;
-extern void validate_pat_support(struct cpuinfo_x86 *c);
#else
static const int pat_enabled;
-static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
#endif
extern void pat_init(void);
@@ -17,6 +15,4 @@ extern int reserve_memtype(u64 start, u64 end,
unsigned long req_type, unsigned long *ret_type);
extern int free_memtype(u64 start, u64 end);
-extern void pat_disable(char *reason);
-
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/pci-functions.h
index ed0bab42735..ed0bab42735 100644
--- a/arch/x86/include/asm/mach-default/pci-functions.h
+++ b/arch/x86/include/asm/pci-functions.h
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 875b38edf19..a977de23cb4 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -19,6 +19,8 @@ struct pci_sysdata {
};
extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@ -82,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
static inline void early_quirks(void) { }
#endif
+extern void pci_iommu_alloc(void);
+
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_32
@@ -98,9 +102,9 @@ static inline void early_quirks(void) { }
#ifdef CONFIG_NUMA
/* Returns the node based on pci bus */
-static inline int __pcibus_to_node(struct pci_bus *bus)
+static inline int __pcibus_to_node(const struct pci_bus *bus)
{
- struct pci_sysdata *sd = bus->sysdata;
+ const struct pci_sysdata *sd = bus->sysdata;
return sd->node;
}
@@ -109,6 +113,12 @@ static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
{
return node_to_cpumask(__pcibus_to_node(bus));
}
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+ return cpumask_of_node(__pcibus_to_node(bus));
+}
#endif
#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index 5b28995d664..4da20798277 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
int reg, int len, u32 value);
extern void dma32_reserve_bootmem(void);
-extern void pci_iommu_alloc(void);
/* The PCI address space does equal the physical memory
* address space. The networking and block device layers use
@@ -34,8 +33,6 @@ extern void pci_iommu_alloc(void);
*/
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
-
#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
dma_addr_t ADDR_NAME;
#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
@@ -49,18 +46,6 @@ extern void pci_iommu_alloc(void);
#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
(((PTR)->LEN_NAME) = (VAL))
-#else
-/* No IOMMU */
-
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
-#define pci_unmap_addr(PTR, ADDR_NAME) (0)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
-#define pci_unmap_len(PTR, LEN_NAME) (0)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
-
-#endif
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/pci/pci.h b/arch/x86/include/asm/pci_x86.h
index 15b9cf6be72..e60fd3e14bd 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops;
struct irq_info {
u8 bus, devfn; /* Bus, device and function */
struct {
- u8 link; /* IRQ line ID, chipset dependent, 0=not routed */
+ u8 link; /* IRQ line ID, chipset dependent,
+ 0 = not routed */
u16 bitmap; /* Available IRQs */
} __attribute__((packed)) irq[4];
u8 slot; /* Slot number, 0=onboard */
@@ -69,11 +70,13 @@ struct irq_routing_table {
u16 version; /* PIRQ_VERSION */
u16 size; /* Table size in bytes */
u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
- u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */
- u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to
+ PCI usage */
+ u16 rtr_vendor, rtr_device; /* Vendor and device ID of
+ interrupt router */
u32 miniport_data; /* Crap */
u8 rfu[11];
- u8 checksum; /* Modulo 256 checksum must give zero */
+ u8 checksum; /* Modulo 256 checksum must give 0 */
struct irq_info slots[0];
} __attribute__((packed));
@@ -96,6 +99,7 @@ extern struct pci_raw_ops *raw_pci_ops;
extern struct pci_raw_ops *raw_pci_ext_ops;
extern struct pci_raw_ops pci_direct_conf1;
+extern bool port_cf9_safe;
/* arch_initcall level */
extern int pci_direct_probe(void);
@@ -147,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos)
static inline void mmio_config_writeb(void __iomem *pos, u8 val)
{
- asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory");
+ asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory");
}
static inline void mmio_config_writew(void __iomem *pos, u16 val)
{
- asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory");
+ asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory");
}
static inline void mmio_config_writel(void __iomem *pos, u32 val)
{
- asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
+ asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
}
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
deleted file mode 100644
index 2fbfff88df3..00000000000
--- a/arch/x86/include/asm/pda.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef _ASM_X86_PDA_H
-#define _ASM_X86_PDA_H
-
-#ifndef __ASSEMBLY__
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <asm/page.h>
-
-/* Per processor datastructure. %gs points to it while the kernel runs */
-struct x8664_pda {
- struct task_struct *pcurrent; /* 0 Current process */
- unsigned long data_offset; /* 8 Per cpu data offset from linker
- address */
- unsigned long kernelstack; /* 16 top of kernel stack for current */
- unsigned long oldrsp; /* 24 user rsp for system call */
- int irqcount; /* 32 Irq nesting counter. Starts -1 */
- unsigned int cpunumber; /* 36 Logical CPU number */
-#ifdef CONFIG_CC_STACKPROTECTOR
- unsigned long stack_canary; /* 40 stack canary value */
- /* gcc-ABI: this canary MUST be at
- offset 40!!! */
-#endif
- char *irqstackptr;
- short nodenumber; /* number of current node (32k max) */
- short in_bootmem; /* pda lives in bootmem */
- unsigned int __softirq_pending;
- unsigned int __nmi_count; /* number of NMI on this CPUs */
- short mmu_state;
- short isidle;
- struct mm_struct *active_mm;
- unsigned apic_timer_irqs;
- unsigned irq0_irqs;
- unsigned irq_resched_count;
- unsigned irq_call_count;
- unsigned irq_tlb_count;
- unsigned irq_thermal_count;
- unsigned irq_threshold_count;
- unsigned irq_spurious_count;
-} ____cacheline_aligned_in_smp;
-
-extern struct x8664_pda **_cpu_pda;
-extern void pda_init(int);
-
-#define cpu_pda(i) (_cpu_pda[i])
-
-/*
- * There is no fast way to get the base address of the PDA, all the accesses
- * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
- */
-extern void __bad_pda_field(void) __attribute__((noreturn));
-
-/*
- * proxy_pda doesn't actually exist, but tell gcc it is accessed for
- * all PDA accesses so it gets read/write dependencies right.
- */
-extern struct x8664_pda _proxy_pda;
-
-#define pda_offset(field) offsetof(struct x8664_pda, field)
-
-#define pda_to_op(op, field, val) \
-do { \
- typedef typeof(_proxy_pda.field) T__; \
- if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
- switch (sizeof(_proxy_pda.field)) { \
- case 2: \
- asm(op "w %1,%%gs:%c2" : \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i"(pda_offset(field))); \
- break; \
- case 4: \
- asm(op "l %1,%%gs:%c2" : \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i" (pda_offset(field))); \
- break; \
- case 8: \
- asm(op "q %1,%%gs:%c2": \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i"(pda_offset(field))); \
- break; \
- default: \
- __bad_pda_field(); \
- } \
-} while (0)
-
-#define pda_from_op(op, field) \
-({ \
- typeof(_proxy_pda.field) ret__; \
- switch (sizeof(_proxy_pda.field)) { \
- case 2: \
- asm(op "w %%gs:%c1,%0" : \
- "=r" (ret__) : \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- case 4: \
- asm(op "l %%gs:%c1,%0": \
- "=r" (ret__): \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- case 8: \
- asm(op "q %%gs:%c1,%0": \
- "=r" (ret__) : \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- default: \
- __bad_pda_field(); \
- } \
- ret__; \
-})
-
-#define read_pda(field) pda_from_op("mov", field)
-#define write_pda(field, val) pda_to_op("mov", field, val)
-#define add_pda(field, val) pda_to_op("add", field, val)
-#define sub_pda(field, val) pda_to_op("sub", field, val)
-#define or_pda(field, val) pda_to_op("or", field, val)
-
-/* This is not atomic against other CPUs -- CPU preemption needs to be off */
-#define test_and_clear_bit_pda(bit, field) \
-({ \
- int old__; \
- asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
- : "=r" (old__), "+m" (_proxy_pda.field) \
- : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
- old__; \
-})
-
-#endif
-
-#define PDA_STACKOFFSET (5*8)
-
-#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece72053ba6..aee103b26d0 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -2,53 +2,12 @@
#define _ASM_X86_PERCPU_H
#ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
-#define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
+#define __percpu_seg gs
+#define __percpu_mov_op movq
+#else
+#define __percpu_seg fs
+#define __percpu_mov_op movl
#endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment. x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime. The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect. However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var) \
- ({ \
- typeof(per_cpu_var(var)) __tmp; \
- preempt_disable(); \
- __tmp = __get_cpu_var(var); \
- preempt_enable(); \
- __tmp; \
- })
-
-#define x86_write_percpu(var, val) \
- do { \
- preempt_disable(); \
- __get_cpu_var(var) = (val); \
- preempt_enable(); \
- } while(0)
-
-#else /* CONFIG_X86_64 */
#ifdef __ASSEMBLY__
@@ -65,47 +24,48 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
* PER_CPU(cpu_gdt_descr, %ebx)
*/
#ifdef CONFIG_SMP
-#define PER_CPU(var, reg) \
- movl %fs:per_cpu__##this_cpu_off, reg; \
+#define PER_CPU(var, reg) \
+ __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var) %fs:per_cpu__##var
+#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
#else /* ! SMP */
-#define PER_CPU(var, reg) \
- movl $per_cpu__##var, reg
+#define PER_CPU(var, reg) \
+ __percpu_mov_op $per_cpu__##var, reg
#define PER_CPU_VAR(var) per_cpu__##var
#endif /* SMP */
+#ifdef CONFIG_X86_64_SMP
+#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
+#else
+#define INIT_PER_CPU_VAR(var) per_cpu__##var
+#endif
+
#else /* ...!ASSEMBLY */
+#include <linux/stringify.h>
+
+#ifdef CONFIG_SMP
+#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
+#define __my_cpu_offset percpu_read(this_cpu_off)
+#else
+#define __percpu_arg(x) "%" #x
+#endif
+
/*
- * PER_CPU finds an address of a per-cpu variable.
+ * Initialized pointers to per-cpu variables needed for the boot
+ * processor need to use these macros to get the proper address
+ * offset from __per_cpu_load on SMP.
*
- * Args:
- * var - variable name
- * cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- * PER_CPU(cpu_gdt_descr, %ebx)
+ * There also must be an entry in vmlinux_64.lds.S
*/
-#ifdef CONFIG_SMP
-
-#define __my_cpu_offset x86_read_percpu(this_cpu_off)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
-#define __percpu_seg "%%fs:"
-
-#else /* !SMP */
-
-#define __percpu_seg ""
-
-#endif /* SMP */
-
-#include <asm-generic/percpu.h>
+#define DECLARE_INIT_PER_CPU(var) \
+ extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
+#ifdef CONFIG_X86_64_SMP
+#define init_per_cpu_var(var) init_per_cpu__##var
+#else
+#define init_per_cpu_var(var) per_cpu_var(var)
+#endif
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
@@ -120,20 +80,25 @@ do { \
} \
switch (sizeof(var)) { \
case 1: \
- asm(op "b %1,"__percpu_seg"%0" \
+ asm(op "b %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 2: \
- asm(op "w %1,"__percpu_seg"%0" \
+ asm(op "w %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 4: \
- asm(op "l %1,"__percpu_seg"%0" \
+ asm(op "l %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
+ case 8: \
+ asm(op "q %1,"__percpu_arg(0) \
+ : "+m" (var) \
+ : "re" ((T__)val)); \
+ break; \
default: __bad_percpu_size(); \
} \
} while (0)
@@ -143,17 +108,22 @@ do { \
typeof(var) ret__; \
switch (sizeof(var)) { \
case 1: \
- asm(op "b "__percpu_seg"%1,%0" \
+ asm(op "b "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 2: \
- asm(op "w "__percpu_seg"%1,%0" \
+ asm(op "w "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 4: \
- asm(op "l "__percpu_seg"%1,%0" \
+ asm(op "l "__percpu_arg(1)",%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
@@ -162,13 +132,30 @@ do { \
ret__; \
})
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
+
+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define x86_test_and_clear_bit_percpu(bit, var) \
+({ \
+ int old__; \
+ asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
+ : "=r" (old__), "+m" (per_cpu__##var) \
+ : "dIr" (bit)); \
+ old__; \
+})
+
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */
#ifdef CONFIG_SMP
@@ -195,9 +182,9 @@ do { \
#define early_per_cpu_ptr(_name) (_name##_early_ptr)
#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
#define early_per_cpu(_name, _cpu) \
- (early_per_cpu_ptr(_name) ? \
- early_per_cpu_ptr(_name)[_cpu] : \
- per_cpu(_name, _cpu))
+ *(early_per_cpu_ptr(_name) ? \
+ &early_per_cpu_ptr(_name)[_cpu] : \
+ &per_cpu(_name, _cpu))
#else /* !CONFIG_SMP */
#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index cb7c151a8bf..dd14c54ac71 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -42,6 +42,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
static inline void pte_free(struct mm_struct *mm, struct page *pte)
{
+ pgtable_page_dtor(pte);
__free_page(pte);
}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index b17edfd2362..c1774ac9da7 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -53,26 +53,56 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
-#define pte_none(x) (!(x).pte_low)
-
/*
- * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
- * into this range:
+ * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
+ * split up the 29 bits of offset into this range:
*/
#define PTE_FILE_MAX_BITS 29
+#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
+#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
+#else
+#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
+#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
+#endif
+#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
+#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
#define pte_to_pgoff(pte) \
- ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5))
+ ((((pte).pte_low >> PTE_FILE_SHIFT1) \
+ & ((1U << PTE_FILE_BITS1) - 1)) \
+ + ((((pte).pte_low >> PTE_FILE_SHIFT2) \
+ & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \
+ + (((pte).pte_low >> PTE_FILE_SHIFT3) \
+ << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
#define pgoff_to_pte(off) \
- ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \
- (((off) >> 5) << 8) + _PAGE_FILE })
+ ((pte_t) { .pte_low = \
+ (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
+ + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \
+ << PTE_FILE_SHIFT2) \
+ + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
+ << PTE_FILE_SHIFT3) \
+ + _PAGE_FILE })
/* Encode and de-code a swap entry */
-#define __swp_type(x) (((x).val >> 1) & 0x1f)
-#define __swp_offset(x) ((x).val >> 8)
-#define __swp_entry(type, offset) \
- ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 52597aeadff..3f13cdf6115 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -18,21 +18,6 @@
printk("%s:%d: bad pgd %p(%016Lx).\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
-static inline int pud_none(pud_t pud)
-{
- return pud_val(pud) == 0;
-}
-
-static inline int pud_bad(pud_t pud)
-{
- return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
-}
-
-static inline int pud_present(pud_t pud)
-{
- return pud_val(pud) & _PAGE_PRESENT;
-}
-
/* Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
@@ -120,15 +105,6 @@ static inline void pud_clear(pud_t *pudp)
write_cr3(pgd);
}
-#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
-
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
-
-
-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \
- pmd_index(address))
-
#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
@@ -145,17 +121,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
-#define __HAVE_ARCH_PTE_SAME
-static inline int pte_same(pte_t a, pte_t b)
-{
- return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
-}
-
-static inline int pte_none(pte_t pte)
-{
- return !pte.pte_low && !pte.pte_high;
-}
-
/*
* Bits 0, 6 and 7 are taken in the low part of the pte,
* put the 32 bits of offset into the high part.
@@ -166,6 +131,7 @@ static inline int pte_none(pte_t pte)
#define PTE_FILE_MAX_BITS 32
/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x) (((x).val) & 0x1f)
#define __swp_offset(x) ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c012f3b1167..8fef0f6bfbb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
+#include <asm/page.h>
+
#define FIRST_USER_ADDRESS 0
#define _PAGE_BIT_PRESENT 0 /* is present */
@@ -10,7 +12,6 @@
#define _PAGE_BIT_PCD 4 /* page cache disabled */
#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
-#define _PAGE_BIT_FILE 6
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -22,6 +23,12 @@
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
@@ -46,11 +53,8 @@
#define _PAGE_NX (_AT(pteval_t, 0))
#endif
-/* If _PAGE_PRESENT is clear, we use these: */
-#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping,
- * saved PTE; unset:swap */
-#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
- pte_present gives true */
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -158,8 +162,19 @@
#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
#endif
+/*
+ * Macro to mark a page protection value as UC-
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
+ : (prot))
+
#ifndef __ASSEMBLY__
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -223,82 +238,110 @@ static inline unsigned long pte_pfn(pte_t pte)
static inline int pmd_large(pmd_t pte)
{
- return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
(_PAGE_PSE | _PAGE_PRESENT);
}
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+ pteval_t v = native_pte_val(pte);
+
+ return native_make_pte(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+ pteval_t v = native_pte_val(pte);
+
+ return native_make_pte(v & ~clear);
+}
+
static inline pte_t pte_mkclean(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkold(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+ return pte_clear_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_RW);
+ return pte_clear_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkexec(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_NX);
+ return pte_clear_flags(pte, _PAGE_NX);
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_ACCESSED);
+ return pte_set_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_mkwrite(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_RW);
+ return pte_set_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkhuge(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_PSE);
+ return pte_set_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_clrhuge(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_PSE);
+ return pte_clear_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_mkglobal(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_GLOBAL);
+ return pte_set_flags(pte, _PAGE_GLOBAL);
}
static inline pte_t pte_clrglobal(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_GLOBAL);
+ return pte_clear_flags(pte, _PAGE_GLOBAL);
}
static inline pte_t pte_mkspecial(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_SPECIAL);
+ return pte_set_flags(pte, _PAGE_SPECIAL);
}
extern pteval_t __supported_pte_mask;
+/*
+ * Mask out unsupported bits in a present pgprot. Non-present pgprots
+ * can use those bits for other purposes, so leave them be.
+ */
+static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
+{
+ pgprotval_t protval = pgprot_val(pgprot);
+
+ if (protval & _PAGE_PRESENT)
+ protval &= __supported_pte_mask;
+
+ return protval;
+}
+
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot));
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot));
}
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
@@ -310,7 +353,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
* the newprot (if present):
*/
val &= _PAGE_CHG_MASK;
- val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+ val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
return __pte(val);
}
@@ -326,9 +369,31 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
-#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+
+static inline int is_new_memtype_allowed(unsigned long flags,
+ unsigned long new_flags)
+{
+ /*
+ * Certain new memtypes are not allowed with certain
+ * requested memtypes:
+ * - request is uncached, return cannot be write-back
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((flags == _PAGE_CACHE_UC_MINUS &&
+ new_flags == _PAGE_CACHE_WB) ||
+ (flags == _PAGE_CACHE_WC &&
+ new_flags == _PAGE_CACHE_WB)) {
+ return 0;
+ }
+
+ return 1;
+}
#ifndef __ASSEMBLY__
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
@@ -402,6 +467,190 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
# include "pgtable_64.h"
#endif
+#ifndef __ASSEMBLY__
+#include <linux/mm_types.h>
+
+static inline int pte_none(pte_t pte)
+{
+ return !pte.pte;
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte == b.pte;
+}
+
+static inline int pte_present(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
+static inline int pmd_present(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_PRESENT;
+}
+
+static inline int pmd_none(pmd_t pmd)
+{
+ /* Only check low word on 32-bit platforms, since it might be
+ out of sync with upper half. */
+ return (unsigned long)native_pmd_val(pmd) == 0;
+}
+
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
+
+/*
+ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this function returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+static inline unsigned pmd_index(unsigned long address)
+{
+ return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
+}
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * (Currently stuck as a macro because of indirect forward reference
+ * to linux/mm.h:page_to_nid())
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+/*
+ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this function returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+static inline unsigned pte_index(unsigned long address)
+{
+ return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+}
+
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
+{
+ return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+ return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline unsigned long pages_to_mb(unsigned long npg)
+{
+ return npg >> (20 - PAGE_SHIFT);
+}
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
+#if PAGETABLE_LEVELS == 2
+static inline int pud_large(pud_t pud)
+{
+ return 0;
+}
+#endif
+
+#if PAGETABLE_LEVELS > 2
+static inline int pud_none(pud_t pud)
+{
+ return native_pud_val(pud) == 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+ return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+
+/* Find an entry in the second-level page table.. */
+static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+{
+ return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline int pud_large(pud_t pud)
+{
+ return (pud_flags(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#if PAGETABLE_LEVELS > 3
+static inline int pgd_present(pgd_t pgd)
+{
+ return pgd_flags(pgd) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+ return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+
+/* to find an entry in a page-table-directory. */
+static inline unsigned pud_index(unsigned long address)
+{
+ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+{
+ return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+}
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline int pgd_none(pgd_t pgd)
+{
+ return !native_pgd_val(pgd);
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+
+#endif /* __ASSEMBLY__ */
+
/*
* the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
*
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f9d5889b336..1952bb762aa 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -85,64 +85,12 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
/* The boot page tables (all created as a single array) */
extern unsigned long pg0[];
-#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-
-/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
-#define pmd_none(x) (!(unsigned long)pmd_val((x)))
-#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
-#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
-
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
-
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
# include <asm/pgtable-2level.h>
#endif
-/*
- * Macro to mark a page protection value as "uncacheable".
- * On processors which do not support it, this is a no-op.
- */
-#define pgprot_noncached(prot) \
- ((boot_cpu_data.x86 > 3) \
- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
- : (prot))
-
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
-
-static inline int pud_large(pud_t pud) { return 0; }
-
-/*
- * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
- *
- * this macro returns the index of the entry in the pmd page which would
- * control the given virtual address
- */
-#define pmd_index(address) \
- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-
-/*
- * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
- *
- * this macro returns the index of the entry in the pte page which would
- * control the given virtual address
- */
-#define pte_index(address) \
- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) \
- ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
-
-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
-#define pmd_page_vaddr(pmd) \
- ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
-
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
@@ -185,7 +133,4 @@ do { \
#define kern_addr_valid(kaddr) (0)
#endif
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
- remap_pfn_range(vma, vaddr, pfn, size, prot)
-
#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 545a0e042bb..1c4e247c51f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -11,7 +11,6 @@
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
-#include <asm/pda.h>
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
@@ -67,9 +66,6 @@ extern void paging_init(void);
printk("%s:%d: bad pgd %p(%016lx).\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
-#define pgd_none(x) (!pgd_val(x))
-#define pud_none(x) (!pud_val(x))
-
struct mm_struct;
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
@@ -134,8 +130,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
-#define pte_same(a, b) ((a).pte == (b).pte)
-
#endif /* !__ASSEMBLY__ */
#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
@@ -146,7 +140,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-#define MAXMEM _AC(0x00003fffffffffff, UL)
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
@@ -156,32 +150,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
#ifndef __ASSEMBLY__
-static inline int pgd_bad(pgd_t pgd)
-{
- return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-static inline int pud_bad(pud_t pud)
-{
- return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-static inline int pmd_bad(pmd_t pmd)
-{
- return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-#define pte_none(x) (!pte_val((x)))
-#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE))
-
-#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
-
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-#define pgprot_noncached(prot) \
- (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
-
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
@@ -190,41 +158,12 @@ static inline int pmd_bad(pmd_t pmd)
/*
* Level 4 access.
*/
-#define pgd_page_vaddr(pgd) \
- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
-#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
-#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
-/* to find an entry in a page-table-directory. */
-#define pud_page_vaddr(pud) \
- ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
-#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
-#define pud_offset(pgd, address) \
- ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
-#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT)
-
-static inline int pud_large(pud_t pte)
-{
- return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
-}
/* PMD - Level 2 access */
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
- pmd_index(address))
-#define pmd_none(x) (!pmd_val((x)))
-#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
-#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
-#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-
#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
_PAGE_FILE })
@@ -232,13 +171,6 @@ static inline int pud_large(pud_t pte)
/* PTE - Level 1 access. */
-/* page, protection -> pte */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
-
-#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
- pte_index((address)))
-
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
@@ -250,19 +182,28 @@ static inline int pud_large(pud_t pte)
extern int direct_gbpages;
/* Encode and de-code a swap entry */
-#define __swp_type(x) (((x).val >> 1) & 0x3f)
-#define __swp_offset(x) ((x).val >> 8)
-#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \
- ((offset) << 8) })
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
- remap_pfn_range(vma, vaddr, pfn, size, prot)
-
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
index fe681147a4f..3ac5032fae0 100644
--- a/arch/x86/include/asm/prctl.h
+++ b/arch/x86/include/asm/prctl.h
@@ -6,5 +6,4 @@
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
-
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ca01e38326..a0133838b67 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -73,7 +73,7 @@ struct cpuinfo_x86 {
char pad0;
#else
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
- int x86_tlbsize;
+ int x86_tlbsize;
__u8 x86_virt_bits;
__u8 x86_phys_bits;
#endif
@@ -110,6 +110,7 @@ struct cpuinfo_x86 {
/* Index into per_cpu list: */
u16 cpu_index;
#endif
+ unsigned int x86_hyper_vendor;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define X86_VENDOR_INTEL 0
@@ -123,6 +124,9 @@ struct cpuinfo_x86 {
#define X86_VENDOR_UNKNOWN 0xff
+#define X86_HYPER_VENDOR_NONE 0
+#define X86_HYPER_VENDOR_VMWARE 1
+
/*
* capabilities of CPUs
*/
@@ -349,7 +353,7 @@ struct i387_soft_struct {
u8 no_update;
u8 rm;
u8 alimit;
- struct info *info;
+ struct math_emu_info *info;
u32 entry_eip;
};
@@ -374,7 +378,29 @@ union thread_xstate {
#ifdef CONFIG_X86_64
DECLARE_PER_CPU(struct orig_ist, orig_ist);
+
+union irq_stack_union {
+ char irq_stack[IRQ_STACK_SIZE];
+ /*
+ * GCC hardcodes the stack canary as %gs:40. Since the
+ * irq_stack is the object at %gs:0, we reserve the bottom
+ * 48 bytes of the irq stack for the canary.
+ */
+ struct {
+ char gs_base[40];
+ unsigned long stack_canary;
+ };
+};
+
+DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
+DECLARE_PER_CPU(char *, irq_stack_ptr);
+#else /* X86_64 */
+#ifdef CONFIG_CC_STACKPROTECTOR
+DECLARE_PER_CPU(unsigned long, stack_canary);
#endif
+#endif /* X86_64 */
extern void print_cpu_info(struct cpuinfo_x86 *);
extern unsigned int xstate_size;
@@ -748,9 +774,22 @@ extern int sysenter_setup(void);
extern struct desc_ptr early_gdt_descr;
extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
extern void cpu_init(void);
-extern void init_gdt(int cpu);
+
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+ return debugctlmsr;
+}
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index d6a22f92ba7..49fb3ecf3bb 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -18,11 +18,7 @@ extern void syscall32_cpu_init(void);
extern void check_efer(void);
-#ifdef CONFIG_X86_BIOS_REBOOT
extern int reboot_force;
-#else
-static const int reboot_force = 0;
-#endif
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 25f1bb8fc62..8e0f8d199e0 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -83,7 +83,7 @@
#ifdef CONFIG_X86_PTRACE_BTS
#ifndef __ASSEMBLY__
-#include <asm/types.h>
+#include <linux/types.h>
/* configuration/status structure used in PTRACE_BTS_CONFIG and
PTRACE_BTS_STATUS commands.
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index d1531c8480b..e304b66abee 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -6,7 +6,6 @@
#include <asm/processor-flags.h>
#ifdef __KERNEL__
-#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
#include <asm/segment.h>
#endif
@@ -29,7 +28,7 @@ struct pt_regs {
int xds;
int xes;
int xfs;
- /* int gs; */
+ int xgs;
long orig_eax;
long eip;
int xcs;
@@ -51,7 +50,7 @@ struct pt_regs {
unsigned long ds;
unsigned long es;
unsigned long fs;
- /* int gs; */
+ unsigned long gs;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
@@ -128,34 +127,6 @@ struct pt_regs {
#endif /* !__i386__ */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* a branch trace record entry
- *
- * In order to unify the interface between various processor versions,
- * we use the below data structure for all processors.
- */
-enum bts_qualifier {
- BTS_INVALID = 0,
- BTS_BRANCH,
- BTS_TASK_ARRIVES,
- BTS_TASK_DEPARTS
-};
-
-struct bts_struct {
- __u64 qualifier;
- union {
- /* BTS_BRANCH */
- struct {
- __u64 from_ip;
- __u64 to_ip;
- } lbr;
- /* BTS_TASK_ARRIVES or
- BTS_TASK_DEPARTS */
- __u64 jiffies;
- } variant;
-};
-#endif /* CONFIG_X86_PTRACE_BTS */
-
#ifdef __KERNEL__
#include <linux/init.h>
@@ -163,13 +134,6 @@ struct bts_struct {
struct cpuinfo_x86;
struct task_struct;
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
-extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
-#else
-#define ptrace_bts_init_intel(config) do {} while (0)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
extern unsigned long profile_pc(struct pt_regs *regs);
extern unsigned long
@@ -271,7 +235,12 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
extern int do_set_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info, int can_allocate);
-#define __ARCH_WANT_COMPAT_SYS_PTRACE
+extern void x86_ptrace_untrace(struct task_struct *);
+extern void x86_ptrace_fork(struct task_struct *child,
+ unsigned long clone_flags);
+
+#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
+#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
index c8e9c8bed3d..c8e9c8bed3d 100644
--- a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
+++ b/arch/x86/include/asm/rdc321x_defs.h
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 1dc1b51ac62..14e0ed86a6f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -61,7 +61,7 @@
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - unused
+ * 28 - stack_canary-20 [ for stack protector ]
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
@@ -95,6 +95,13 @@
#define __KERNEL_PERCPU 0
#endif
+#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
+#else
+#define __KERNEL_STACK_CANARY 0
+#endif
+
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
/*
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f12d3723746..45b40278b58 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_SETUP_H
#define _ASM_X86_SETUP_H
+#ifdef __KERNEL__
+
#define COMMAND_LINE_SIZE 2048
#ifndef __ASSEMBLY__
@@ -8,6 +10,8 @@
/* Interrupt control for vSMPowered x86_64 systems */
void vsmp_init(void);
+void setup_bios_corruption_check(void);
+
#ifdef CONFIG_X86_VISWS
extern void visws_early_detect(void);
extern int is_visws_box(void);
@@ -16,12 +20,14 @@ static inline void visws_early_detect(void) { }
static inline int is_visws_box(void) { return 0; }
#endif
+extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
+extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
/*
* Any setup quirks to be performed?
*/
-struct mpc_config_processor;
-struct mpc_config_bus;
-struct mp_config_oemtable;
+struct mpc_cpu;
+struct mpc_bus;
+struct mpc_oemtable;
struct x86_quirks {
int (*arch_pre_time_init)(void);
int (*arch_time_init)(void);
@@ -33,12 +39,13 @@ struct x86_quirks {
int (*mach_find_smp_config)(unsigned int reserve);
int *mpc_record;
- int (*mpc_apic_id)(struct mpc_config_processor *m);
- void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
- void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
- void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
- unsigned short oemsize);
+ int (*mpc_apic_id)(struct mpc_cpu *m);
+ void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
+ void (*mpc_oem_pci_bus)(struct mpc_bus *m);
+ void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
+ unsigned short oemsize);
int (*setup_ioapic_ids)(void);
+ int (*update_genapic)(void);
};
extern struct x86_quirks *x86_quirks;
@@ -49,8 +56,6 @@ extern unsigned long saved_video_mode;
#endif
#endif /* __ASSEMBLY__ */
-#ifdef __KERNEL__
-
#ifdef __i386__
#include <linux/pfn.h>
@@ -93,7 +98,6 @@ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;
#else
-void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);
diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/setup_arch.h
index 38846208b54..38846208b54 100644
--- a/arch/x86/include/asm/mach-default/setup_arch.h
+++ b/arch/x86/include/asm/setup_arch.h
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 0afcb5e58ac..ec666491aaa 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -2,7 +2,7 @@
#define _ASM_X86_SIGCONTEXT_H
#include <linux/compiler.h>
-#include <asm/types.h>
+#include <linux/types.h>
#define FP_XSTATE_MAGIC1 0x46505853U
#define FP_XSTATE_MAGIC2 0x46505845U
diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h
index 6126188cf3a..ad1478c4ae1 100644
--- a/arch/x86/include/asm/sigcontext32.h
+++ b/arch/x86/include/asm/sigcontext32.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_SIGCONTEXT32_H
#define _ASM_X86_SIGCONTEXT32_H
+#include <linux/types.h>
+
/* signal context for 32bit programs. */
#define X86_FXSR_MAGIC 0x0000
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
new file mode 100644
index 00000000000..4e0fe26d27d
--- /dev/null
+++ b/arch/x86/include/asm/sigframe.h
@@ -0,0 +1,70 @@
+#ifndef _ASM_X86_SIGFRAME_H
+#define _ASM_X86_SIGFRAME_H
+
+#include <asm/sigcontext.h>
+#include <asm/siginfo.h>
+#include <asm/ucontext.h>
+
+#ifdef CONFIG_X86_32
+#define sigframe_ia32 sigframe
+#define rt_sigframe_ia32 rt_sigframe
+#define sigcontext_ia32 sigcontext
+#define _fpstate_ia32 _fpstate
+#define ucontext_ia32 ucontext
+#else /* !CONFIG_X86_32 */
+
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/ia32.h>
+#endif /* CONFIG_IA32_EMULATION */
+
+#endif /* CONFIG_X86_32 */
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+struct sigframe_ia32 {
+ u32 pretcode;
+ int sig;
+ struct sigcontext_ia32 sc;
+ /*
+ * fpstate is unused. fpstate is moved/allocated after
+ * retcode[] below. This movement allows the FP state and the
+ * future state extensions (xsave) to stay together.
+ * At the same time, retaining the unused fpstate prevents changing
+ * the offset of extramask[] in the sigframe and thus avoids breaking
+ * any legacy application accessing/modifying it.
+ */
+ struct _fpstate_ia32 fpstate_unused;
+#ifdef CONFIG_IA32_EMULATION
+ unsigned int extramask[_COMPAT_NSIG_WORDS-1];
+#else /* !CONFIG_IA32_EMULATION */
+ unsigned long extramask[_NSIG_WORDS-1];
+#endif /* CONFIG_IA32_EMULATION */
+ char retcode[8];
+ /* fp state follows here */
+};
+
+struct rt_sigframe_ia32 {
+ u32 pretcode;
+ int sig;
+ u32 pinfo;
+ u32 puc;
+#ifdef CONFIG_IA32_EMULATION
+ compat_siginfo_t info;
+#else /* !CONFIG_IA32_EMULATION */
+ struct siginfo info;
+#endif /* CONFIG_IA32_EMULATION */
+ struct ucontext_ia32 uc;
+ char retcode[8];
+ /* fp state follows here */
+};
+#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
+
+#ifdef CONFIG_X86_64
+struct rt_sigframe {
+ char __user *pretcode;
+ struct ucontext uc;
+ struct siginfo info;
+ /* fp state follows here */
+};
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 96ac44f275d..7761a5d554b 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -121,6 +121,10 @@ typedef unsigned long sigset_t;
#ifndef __ASSEMBLY__
+# ifdef __KERNEL__
+extern void do_notify_resume(struct pt_regs *, void *, __u32);
+# endif /* __KERNEL__ */
+
#ifdef __i386__
# ifdef __KERNEL__
struct old_sigaction {
@@ -141,8 +145,6 @@ struct k_sigaction {
struct sigaction sa;
};
-extern void do_notify_resume(struct pt_regs *, void *, __u32);
-
# else /* __KERNEL__ */
/* Here we must cater to libcs that poke about in kernel headers. */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811ce51d..47d0e21f2b9 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -15,28 +15,26 @@
# include <asm/io_apic.h>
# endif
#endif
-#include <asm/pda.h>
#include <asm/thread_info.h>
-
-extern cpumask_t cpu_callout_map;
-extern cpumask_t cpu_initialized;
-extern cpumask_t cpu_callin_map;
-
-extern void (*mtrr_hook)(void);
-extern void zap_low_mappings(void);
-
-extern int __cpuinit get_local_pda(int cpu);
+#include <asm/cpumask.h>
extern int smp_num_siblings;
extern unsigned int num_processors;
-extern cpumask_t cpu_initialized;
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(int, cpu_number);
-#endif
+
+static inline struct cpumask *cpu_sibling_mask(int cpu)
+{
+ return &per_cpu(cpu_sibling_map, cpu);
+}
+
+static inline struct cpumask *cpu_core_mask(int cpu)
+{
+ return &per_cpu(cpu_core_map, cpu);
+}
DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
@@ -60,7 +58,7 @@ struct smp_ops {
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
- void (*send_call_func_ipi)(cpumask_t mask);
+ void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
};
@@ -125,7 +123,7 @@ static inline void arch_send_call_function_single_ipi(int cpu)
static inline void arch_send_call_function_ipi(cpumask_t mask)
{
- smp_ops.send_call_func_ipi(mask);
+ smp_ops.send_call_func_ipi(&mask);
}
void cpu_disable_common(void);
@@ -138,22 +136,16 @@ void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
-void native_send_call_func_ipi(cpumask_t mask);
+void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
-extern void prefill_possible_map(void);
-
void smp_store_cpu_info(int id);
#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
/* We don't mark CPUs online until __cpu_up(), so we need another measure */
static inline int num_booting_cpus(void)
{
- return cpus_weight(cpu_callout_map);
-}
-#else
-static inline void prefill_possible_map(void)
-{
+ return cpumask_weight(cpu_callout_mask);
}
#endif /* CONFIG_SMP */
@@ -165,11 +157,11 @@ extern unsigned disabled_cpus __cpuinitdata;
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
extern int safe_smp_processor_id(void);
#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() (percpu_read(cpu_number))
#define stack_smp_processor_id() \
({ \
@@ -179,10 +171,6 @@ extern int safe_smp_processor_id(void);
})
#define safe_smp_processor_id() smp_processor_id()
-#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
-#define cpu_physical_id(cpu) boot_cpu_physical_apicid
-#define safe_smp_processor_id() 0
-#define stack_smp_processor_id() 0
#endif
#ifdef CONFIG_X86_LOCAL_APIC
@@ -194,28 +182,9 @@ static inline int logical_smp_processor_id(void)
return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
}
-#include <mach_apicdef.h>
-static inline unsigned int read_apic_id(void)
-{
- unsigned int reg;
-
- reg = *(u32 *)(APIC_BASE + APIC_ID);
-
- return GET_APIC_ID(reg);
-}
#endif
-
-# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
extern int hard_smp_processor_id(void);
-# else
-#include <mach_apicdef.h>
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return read_apic_id();
-}
-# endif /* APIC_DEFINITION */
#else /* CONFIG_X86_LOCAL_APIC */
@@ -225,11 +194,5 @@ static inline int hard_smp_processor_id(void)
#endif /* CONFIG_X86_LOCAL_APIC */
-#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
-extern unsigned char boot_cpu_id;
-#else
-#define boot_cpu_id 0
-#endif
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index dbab36d64d4..1def6011490 100644
--- a/arch/x86/include/asm/mach-default/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
CMOS_WRITE(0xa, 0xf);
local_flush_tlb();
pr_debug("1.\n");
- *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
+ start_eip >> 4;
pr_debug("2.\n");
- *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
+ *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
+ start_eip & 0xf;
pr_debug("3.\n");
}
@@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*/
CMOS_WRITE(0, 0xf);
- *((volatile long *) phys_to_virt(0x467)) = 0;
+ *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0;
}
static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index be44f7dab39..e3cc3c063ec 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 44
+# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
#endif
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index d17c91981da..3a569665668 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -172,70 +172,8 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
}
-#ifdef CONFIG_PARAVIRT
-/*
- * Define virtualization-friendly old-style lock byte lock, for use in
- * pv_lock_ops if desired.
- *
- * This differs from the pre-2.6.24 spinlock by always using xchgb
- * rather than decb to take the lock; this allows it to use a
- * zero-initialized lock structure. It also maintains a 1-byte
- * contention counter, so that we can implement
- * __byte_spin_is_contended.
- */
-struct __byte_spinlock {
- s8 lock;
- s8 spinners;
-};
-
-static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
-{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- return bl->lock != 0;
-}
-
-static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
-{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- return bl->spinners != 0;
-}
-
-static inline void __byte_spin_lock(raw_spinlock_t *lock)
-{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- s8 val = 1;
-
- asm("1: xchgb %1, %0\n"
- " test %1,%1\n"
- " jz 3f\n"
- " " LOCK_PREFIX "incb %2\n"
- "2: rep;nop\n"
- " cmpb $1, %0\n"
- " je 2b\n"
- " " LOCK_PREFIX "decb %2\n"
- " jmp 1b\n"
- "3:"
- : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
-}
-
-static inline int __byte_spin_trylock(raw_spinlock_t *lock)
-{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- u8 old = 1;
-
- asm("xchgb %1,%0"
- : "+m" (bl->lock), "+q" (old) : : "memory");
+#ifndef CONFIG_PARAVIRT
- return old == 0;
-}
-
-static inline void __byte_spin_unlock(raw_spinlock_t *lock)
-{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- smp_wmb();
- bl->lock = 0;
-}
-#else /* !CONFIG_PARAVIRT */
static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
{
return __ticket_spin_is_locked(lock);
@@ -245,6 +183,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
{
return __ticket_spin_is_contended(lock);
}
+#define __raw_spin_is_contended __raw_spin_is_contended
static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
{
@@ -267,7 +206,7 @@ static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
__raw_spin_lock(lock);
}
-#endif /* CONFIG_PARAVIRT */
+#endif
static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
{
@@ -329,8 +268,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock)
{
atomic_t *count = (atomic_t *)lock;
- atomic_dec(count);
- if (atomic_read(count) >= 0)
+ if (atomic_dec_return(count) >= 0)
return 1;
atomic_inc(count);
return 0;
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
new file mode 100644
index 00000000000..c2d742c6e15
--- /dev/null
+++ b/arch/x86/include/asm/stackprotector.h
@@ -0,0 +1,124 @@
+/*
+ * GCC stack protector support.
+ *
+ * Stack protector works by putting a predefined pattern at the start
+ * of the stack frame and verifying that it hasn't been overwritten when
+ * returning from the function. The pattern is called the stack canary
+ * and unfortunately gcc requires it to be at a fixed offset from %gs.
+ * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
+ * and x86_32 use segment registers differently and thus handle this
+ * requirement differently.
+ *
+ * On x86_64, %gs is shared by percpu area and stack canary. All
+ * percpu symbols are zero based and %gs points to the base of percpu
+ * area. The first occupant of the percpu area is always
+ * irq_stack_union which contains stack_canary at offset 40. Userland
+ * %gs is always saved and restored on kernel entry and exit using
+ * swapgs, so stack protector doesn't add any complexity there.
+ *
+ * On x86_32, it's slightly more complicated. As in x86_64, %gs is
+ * used for userland TLS. Unfortunately, some processors are much
+ * slower at loading segment registers with a different value when
+ * entering and leaving the kernel, so the kernel uses %fs for the
+ * percpu area and manages %gs lazily so that %gs is switched only when
+ * necessary, usually during task switch.
+ *
+ * As gcc requires the stack canary at %gs:20, %gs can't be managed
+ * lazily if stack protector is enabled, so the kernel saves and
+ * restores userland %gs on kernel entry and exit. This behavior is
+ * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
+ * system.h to hide the details.
+ */
+
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H 1
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+
+#include <asm/tsc.h>
+#include <asm/processor.h>
+#include <asm/percpu.h>
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <linux/random.h>
+
+/*
+ * 24 byte read-only segment initializer for stack canary. Linker
+ * can't handle the address bit shifting. Address will be set in
+ * head_32 for boot CPU and setup_per_cpu_areas() for others.
+ */
+#define GDT_STACK_CANARY_INIT \
+ [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } },
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+ u64 canary;
+ u64 tsc;
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
+#endif
+ /*
+ * We use both the random pool and the current TSC as sources
+ * of randomness. The TSC only matters for very early init,
+ * where it already has some randomness on most systems. Later
+ * on during the bootup the random pool has true entropy too.
+ */
+ get_random_bytes(&canary, sizeof(canary));
+ tsc = __native_read_tsc();
+ canary += tsc + (tsc << 32UL);
+
+ current->stack_canary = canary;
+#ifdef CONFIG_X86_64
+ percpu_write(irq_stack_union.stack_canary, canary);
+#else
+ percpu_write(stack_canary, canary);
+#endif
+}
+
+static inline void setup_stack_canary_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
+ struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
+ struct desc_struct desc;
+
+ desc = gdt_table[GDT_ENTRY_STACK_CANARY];
+ desc.base0 = canary & 0xffff;
+ desc.base1 = (canary >> 16) & 0xff;
+ desc.base2 = (canary >> 24) & 0xff;
+ write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
+#endif
+}
+
+static inline void load_stack_canary_segment(void)
+{
+#ifdef CONFIG_X86_32
+ asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
+#endif
+}
+
+#else /* CC_STACKPROTECTOR */
+
+#define GDT_STACK_CANARY_INIT
+
+/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
+
+static inline void setup_stack_canary_segment(int cpu)
+{ }
+
+static inline void load_stack_canary_segment(void)
+{
+#ifdef CONFIG_X86_32
+ asm volatile ("mov %0, %%gs" : : "r" (0));
+#endif
+}
+
+#endif /* CC_STACKPROTECTOR */
+#endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
deleted file mode 100644
index 9b3070f1c2a..00000000000
--- a/arch/x86/include/asm/summit/apic.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef __ASM_SUMMIT_APIC_H
-#define __ASM_SUMMIT_APIC_H
-
-#include <asm/smp.h>
-
-#define esr_disable (1)
-#define NO_BALANCE_IRQ (0)
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT 4
-#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
-#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static inline cpumask_t target_cpus(void)
-{
- /* CPU_MASK_ALL (0xff) has undefined behaviour with
- * dest_LowestPrio mode logical clustered apic interrupt routing
- * Just start on cpu 0. IRQ balancing will spread load
- */
- return cpumask_of_cpu(0);
-}
-
-#define INT_DELIVERY_MODE (dest_LowestPrio)
-#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
-
-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return 0;
-}
-
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
-static inline unsigned long check_apicid_present(int bit)
-{
- return 1;
-}
-
-#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
-
-extern u8 cpu_2_logical_apicid[];
-
-static inline void init_apic_ldr(void)
-{
- unsigned long val, id;
- int count = 0;
- u8 my_id = (u8)hard_smp_processor_id();
- u8 my_cluster = (u8)apicid_cluster(my_id);
-#ifdef CONFIG_SMP
- u8 lid;
- int i;
-
- /* Create logical APIC IDs by counting CPUs already in cluster. */
- for (count = 0, i = NR_CPUS; --i >= 0; ) {
- lid = cpu_2_logical_apicid[i];
- if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
- ++count;
- }
-#endif
- /* We only have a 4 wide bitmap in cluster mode. If a deranged
- * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
- BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
- id = my_cluster | (1UL << count);
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static inline int multi_timer_check(int apic, int irq)
-{
- return 0;
-}
-
-static inline int apic_id_registered(void)
-{
- return 1;
-}
-
-static inline void setup_apic_routing(void)
-{
- printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static inline int apicid_to_node(int logical_apicid)
-{
-#ifdef CONFIG_SMP
- return apicid_2_node[hard_smp_processor_id()];
-#else
- return 0;
-#endif
-}
-
-/* Mapping from cpu number to logical apicid */
-static inline int cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
- if (cpu >= NR_CPUS)
- return BAD_APICID;
- return (int)cpu_2_logical_apicid[cpu];
-#else
- return logical_smp_processor_id();
-#endif
-}
-
-static inline int cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < NR_CPUS)
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0x0F);
-}
-
-static inline physid_mask_t apicid_to_cpu_present(int apicid)
-{
- return physid_mask_of_physid(0);
-}
-
-static inline void setup_portio_remap(void)
-{
-}
-
-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return 1;
-}
-
-static inline void enable_apic_mode(void)
-{
-}
-
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
- int num_bits_set;
- int cpus_found = 0;
- int cpu;
- int apicid;
-
- num_bits_set = cpus_weight(cpumask);
- /* Return id to all */
- if (num_bits_set == NR_CPUS)
- return (int) 0xFF;
- /*
- * The cpus in the mask must all be on the apic cluster. If are not
- * on the same apicid cluster return default value of TARGET_CPUS.
- */
- cpu = first_cpu(cpumask);
- apicid = cpu_to_logical_apicid(cpu);
- while (cpus_found < num_bits_set) {
- if (cpu_isset(cpu, cpumask)) {
- int new_apicid = cpu_to_logical_apicid(cpu);
- if (apicid_cluster(apicid) !=
- apicid_cluster(new_apicid)){
- printk ("%s: Not a valid mask!\n", __func__);
- return 0xFF;
- }
- apicid = apicid | new_apicid;
- cpus_found++;
- }
- cpu++;
- }
- return apicid;
-}
-
-/* cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value. For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
-#endif /* __ASM_SUMMIT_APIC_H */
diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h
deleted file mode 100644
index f3fbca1f61c..00000000000
--- a/arch/x86/include/asm/summit/apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __ASM_SUMMIT_APICDEF_H
-#define __ASM_SUMMIT_APICDEF_H
-
-#define APIC_ID_MASK (0xFF<<24)
-
-static inline unsigned get_apic_id(unsigned long x)
-{
- return (x>>24)&0xFF;
-}
-
-#define GET_APIC_ID(x) get_apic_id(x)
-
-#endif
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
deleted file mode 100644
index 53bd1e7bd7b..00000000000
--- a/arch/x86/include/asm/summit/ipi.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __ASM_SUMMIT_IPI_H
-#define __ASM_SUMMIT_IPI_H
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
-
-static inline void send_IPI_mask(cpumask_t mask, int vector)
-{
- send_IPI_mask_sequence(mask, vector);
-}
-
-static inline void send_IPI_allbutself(int vector)
-{
- cpumask_t mask = cpu_online_map;
- cpu_clear(smp_processor_id(), mask);
-
- if (!cpus_empty(mask))
- send_IPI_mask(mask, vector);
-}
-
-static inline void send_IPI_all(int vector)
-{
- send_IPI_mask(cpu_online_map, vector);
-}
-
-#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h
deleted file mode 100644
index 013ce6fab2d..00000000000
--- a/arch/x86/include/asm/summit/mpparse.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef __ASM_SUMMIT_MPPARSE_H
-#define __ASM_SUMMIT_MPPARSE_H
-
-#include <asm/tsc.h>
-
-extern int use_cyclone;
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-extern void setup_summit(void);
-#else
-#define setup_summit() {}
-#endif
-
-static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
-{
- if (!strncmp(oem, "IBM ENSW", 8) &&
- (!strncmp(productid, "VIGIL SMP", 9)
- || !strncmp(productid, "EXA", 3)
- || !strncmp(productid, "RUTHLESS SMP", 12))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-/* Hook from generic ACPI tables.c */
-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- if (!strncmp(oem_id, "IBM", 3) &&
- (!strncmp(oem_table_id, "SERVIGIL", 8)
- || !strncmp(oem_table_id, "EXA", 3))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-struct rio_table_hdr {
- unsigned char version; /* Version number of this data structure */
- /* Version 3 adds chassis_num & WP_index */
- unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
- unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
-} __attribute__((packed));
-
-struct scal_detail {
- unsigned char node_id; /* Scalability Node ID */
- unsigned long CBAR; /* Address of 1MB register space */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF = None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port2node; /* Node ID port connected to: 0xFF = None */
- unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
-} __attribute__((packed));
-
-struct rio_detail {
- unsigned char node_id; /* RIO Node ID */
- unsigned long BBAR; /* Address of 1MB register space */
- unsigned char type; /* Type of device */
- unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
- /* For CYC: Node ID of Twister that owns this CYC */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF=None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
- /* For CYC: 0 */
- unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
- /* = 0 : the XAPIC is not used, ie:*/
- /* ints fwded to another XAPIC */
- /* Bits1:7 Reserved */
- /* For CYC: Bits0:7 Reserved */
- unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
- /* lower slot numbers/PCI bus numbers */
- /* For CYC: No meaning */
- unsigned char chassis_num; /* 1 based Chassis number */
- /* For LookOut WPEGs this field indicates the */
- /* Expansion Chassis #, enumerated from Boot */
- /* Node WPEG external port, then Boot Node CYC */
- /* external port, then Next Vigil chassis WPEG */
- /* external port, etc. */
- /* Shared Lookouts have only 1 chassis number (the */
- /* first one assigned) */
-} __attribute__((packed));
-
-
-typedef enum {
- CompatTwister = 0, /* Compatibility Twister */
- AltTwister = 1, /* Alternate Twister of internal 8-way */
- CompatCyclone = 2, /* Compatibility Cyclone */
- AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
- CompatWPEG = 4, /* Compatibility WPEG */
- AltWPEG = 5, /* Second Planar WPEG */
- LookOutAWPEG = 6, /* LookOut WPEG */
- LookOutBWPEG = 7, /* LookOut WPEG */
-} node_type;
-
-static inline int is_WPEG(struct rio_detail *rio){
- return (rio->type == CompatWPEG || rio->type == AltWPEG ||
- rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
-}
-
-#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e86..1b8afa78e86 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/include/asm/svm.h
diff --git a/arch/x86/include/asm/swab.h b/arch/x86/include/asm/swab.h
new file mode 100644
index 00000000000..557cd9f0066
--- /dev/null
+++ b/arch/x86/include/asm/swab.h
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_SWAB_H
+#define _ASM_X86_SWAB_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
+{ /* Return val with its four bytes in reverse order. */
+#ifdef __i386__
+# ifdef CONFIG_X86_BSWAP
+ asm("bswap %0" : "=r" (val) : "0" (val));
+# else /* !CONFIG_X86_BSWAP: two byte exchanges around a 16-bit rotate */
+ asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
+ "rorl $16,%0\n\t" /* swap words */
+ "xchgb %b0,%h0" /* swap higher bytes */
+ : "=q" (val) /* "q": operand needs byte sub-registers for %b0/%h0 */
+ : "0" (val));
+# endif /* CONFIG_X86_BSWAP */
+
+#else /* !__i386__ */
+ asm("bswapl %0"
+ : "=r" (val)
+ : "0" (val));
+#endif /* __i386__ */
+ return val;
+}
+#define __arch_swab32 __arch_swab32 /* NOTE(review): presumably lets generic code #ifdef-detect this override - confirm */
+
+static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
+{ /* Return val with its eight bytes in reverse order. */
+#ifdef __i386__
+ union {
+ struct {
+ __u32 a;
+ __u32 b;
+ } s;
+ __u64 u;
+ } v;
+ v.u = val; /* the union exposes the value as two 32-bit halves */
+# ifdef CONFIG_X86_BSWAP
+ asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" /* reverse each half, then exchange the halves */
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+# else /* !CONFIG_X86_BSWAP: reverse each half via __arch_swab32() */
+ v.s.a = __arch_swab32(v.s.a);
+ v.s.b = __arch_swab32(v.s.b);
+ asm("xchgl %0,%1" /* exchange the already-reversed halves */
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+# endif /* CONFIG_X86_BSWAP */
+ return v.u;
+#else /* !__i386__ */
+ asm("bswapq %0"
+ : "=r" (val)
+ : "0" (val));
+ return val;
+#endif /* __i386__ */
+}
+#define __arch_swab64 __arch_swab64 /* NOTE(review): presumably lets generic code #ifdef-detect this override - confirm */
+
+#endif /* _ASM_X86_SWAB_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 51fb2c76ad7..b9e4e20174f 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -1,46 +1,10 @@
#ifndef _ASM_X86_SWIOTLB_H
#define _ASM_X86_SWIOTLB_H
-#include <asm/dma-mapping.h>
+#include <linux/swiotlb.h>
/* SWIOTLB interface */
-extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
- size_t size, int dir);
-extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags);
-extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir);
-extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
- dma_addr_t dev_addr,
- size_t size, int dir);
-extern void swiotlb_sync_single_for_device(struct device *hwdev,
- dma_addr_t dev_addr,
- size_t size, int dir);
-extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
- dma_addr_t dev_addr,
- unsigned long offset,
- size_t size, int dir);
-extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
- dma_addr_t dev_addr,
- unsigned long offset,
- size_t size, int dir);
-extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
- struct scatterlist *sg, int nelems,
- int dir);
-extern void swiotlb_sync_sg_for_device(struct device *hwdev,
- struct scatterlist *sg, int nelems,
- int dir);
-extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
- int nents, int direction);
-extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
- int nents, int direction);
-extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
-extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
- void *vaddr, dma_addr_t dma_handle);
-extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
-extern void swiotlb_init(void);
-
extern int swiotlb_force;
#ifdef CONFIG_SWIOTLB
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
new file mode 100644
index 00000000000..ffb08be2a53
--- /dev/null
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -0,0 +1,101 @@
+/*
+ * sys_ia32.h - Linux ia32 syscall interfaces
+ *
+ * Copyright (c) 2008 Jaswinder Singh Rajput
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYS_IA32_H
+#define _ASM_X86_SYS_IA32_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/signal.h>
+#include <asm/compat.h>
+#include <asm/ia32.h>
+
+/* ia32/sys_ia32.c */
+asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
+
+asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
+asmlinkage long sys32_fstatat(unsigned int, char __user *,
+ struct stat64 __user *, int);
+struct mmap_arg_struct;
+asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
+asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
+
+asmlinkage long sys32_pipe(int __user *);
+struct sigaction32;
+struct old_sigaction32;
+asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
+ struct sigaction32 __user *, unsigned int);
+asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
+ struct old_sigaction32 __user *);
+asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
+ compat_sigset_t __user *, unsigned int);
+asmlinkage long sys32_alarm(unsigned int);
+
+struct sel_arg_struct;
+asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
+asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
+asmlinkage long sys32_sysfs(int, u32, u32);
+
+asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
+ struct compat_timespec __user *);
+asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
+asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+struct sysctl_ia32;
+asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
+#endif
+
+asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+
+asmlinkage long sys32_personality(unsigned long);
+asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
+
+asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+
+struct oldold_utsname;
+struct old_utsname;
+asmlinkage long sys32_olduname(struct oldold_utsname __user *);
+long sys32_uname(struct old_utsname __user *);
+
+long sys32_ustat(unsigned, struct ustat32 __user *);
+
+asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+ compat_uptr_t __user *, struct pt_regs *);
+asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
+
+long sys32_lseek(unsigned int, int, unsigned int);
+long sys32_kill(int, int);
+long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
+long sys32_vm86_warning(void);
+long sys32_lookup_dcookie(u32, u32, char __user *, size_t);
+
+asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
+asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
+ unsigned, unsigned, int);
+asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
+asmlinkage long sys32_fallocate(int, int, unsigned,
+ unsigned, unsigned, unsigned);
+
+/* ia32/ia32_signal.c */
+asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
+asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *,
+ stack_ia32_t __user *, struct pt_regs *);
+asmlinkage long sys32_sigreturn(struct pt_regs *);
+asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
+
+/* ia32/ipc32.c */
+asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 87803da4401..68b1be10cfa 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -19,27 +19,31 @@
/* kernel/ioport.c */
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+/* kernel/ldt.c */
+asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+
+/* kernel/tls.c */
+asmlinkage int sys_set_thread_area(struct user_desc __user *);
+asmlinkage int sys_get_thread_area(struct user_desc __user *);
+
/* X86_32 only */
#ifdef CONFIG_X86_32
/* kernel/process_32.c */
-asmlinkage int sys_fork(struct pt_regs);
-asmlinkage int sys_clone(struct pt_regs);
-asmlinkage int sys_vfork(struct pt_regs);
-asmlinkage int sys_execve(struct pt_regs);
+int sys_fork(struct pt_regs *);
+int sys_clone(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+int sys_execve(struct pt_regs *);
/* kernel/signal_32.c */
asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
struct old_sigaction __user *);
-asmlinkage int sys_sigaltstack(unsigned long);
-asmlinkage unsigned long sys_sigreturn(unsigned long);
-asmlinkage int sys_rt_sigreturn(unsigned long);
+int sys_sigaltstack(struct pt_regs *);
+unsigned long sys_sigreturn(struct pt_regs *);
+long sys_rt_sigreturn(struct pt_regs *);
/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned long);
-
-/* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+long sys_iopl(struct pt_regs *);
/* kernel/sys_i386_32.c */
asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
@@ -54,13 +58,9 @@ asmlinkage int sys_uname(struct old_utsname __user *);
struct oldold_utsname;
asmlinkage int sys_olduname(struct oldold_utsname __user *);
-/* kernel/tls.c */
-asmlinkage int sys_set_thread_area(struct user_desc __user *);
-asmlinkage int sys_get_thread_area(struct user_desc __user *);
-
/* kernel/vm86_32.c */
-asmlinkage int sys_vm86old(struct pt_regs);
-asmlinkage int sys_vm86(struct pt_regs);
+int sys_vm86old(struct pt_regs *);
+int sys_vm86(struct pt_regs *);
#else /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 2ed3f0f44ff..c00bfdbdd45 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -17,12 +17,26 @@
# define AT_VECTOR_SIZE_ARCH 1
#endif
-#ifdef CONFIG_X86_32
-
struct task_struct; /* one of the stranger aspects of C forward declarations */
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
+#ifdef CONFIG_X86_32
+
+#ifdef CONFIG_CC_STACKPROTECTOR /* switch_to() asm pieces: copy next's stack_canary into the per-cpu stack_canary */
+#define __switch_canary \
+ "movl %P[task_canary](%[next]), %%ebx\n\t" /* ebx = next->stack_canary */ \
+ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" /* per-cpu stack_canary = ebx */
+#define __switch_canary_oparam /* extra asm output operand: the per-cpu canary */ \
+ , [stack_canary] "=m" (per_cpu_var(stack_canary))
+#define __switch_canary_iparam /* extra asm input operand: offset of stack_canary in task_struct */ \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* !CC_STACKPROTECTOR: all three pieces expand to nothing */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
@@ -44,6 +58,7 @@ do { \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
"movl $1f,%[prev_ip]\n\t" /* save EIP */ \
"pushl %[next_ip]\n\t" /* restore EIP */ \
+ __switch_canary \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP */ \
@@ -58,6 +73,8 @@ do { \
"=b" (ebx), "=c" (ecx), "=d" (edx), \
"=S" (esi), "=D" (edi) \
\
+ __switch_canary_oparam \
+ \
/* input parameters: */ \
: [next_sp] "m" (next->thread.sp), \
[next_ip] "m" (next->thread.ip), \
@@ -66,6 +83,8 @@ do { \
[prev] "a" (prev), \
[next] "d" (next) \
\
+ __switch_canary_iparam \
+ \
: /* reloaded segment registers */ \
"memory"); \
} while (0)
@@ -86,27 +105,44 @@ do { \
, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
"r12", "r13", "r14", "r15"
+#ifdef CONFIG_CC_STACKPROTECTOR /* switch_to() asm pieces: copy the canary of the task in %rsi into irq_stack_union.stack_canary */
+#define __switch_canary \
+ "movq %P[task_canary](%%rsi),%%r8\n\t" /* r8 = stack_canary of the task pointed to by %rsi */ \
+ "movq %%r8,"__percpu_arg([gs_canary])"\n\t" /* per-cpu irq_stack_union.stack_canary = r8 */
+#define __switch_canary_oparam /* extra asm output operand: the per-cpu canary */ \
+ , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+#define __switch_canary_iparam /* extra asm input operand: offset of stack_canary in task_struct */ \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* !CC_STACKPROTECTOR: all three pieces expand to nothing */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/* Save restore flags to clear handle leaking NT */
#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
+ asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
"call __switch_to\n\t" \
".globl thread_return\n" \
"thread_return:\n\t" \
- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq "__percpu_arg([current_task])",%%rsi\n\t" \
+ __switch_canary \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
"movq %%rax,%%rdi\n\t" \
- "jc ret_from_fork\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "jnz ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
+ __switch_canary_oparam \
: [next] "S" (next), [prev] "D" (prev), \
[threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
[ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [tif_fork] "i" (TIF_FORK), \
+ [_tif_fork] "i" (_TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, stack)), \
- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+ [current_task] "m" (per_cpu_var(current_task)) \
+ __switch_canary_iparam \
: "memory", "cc" __EXTRA_CLOBBER)
#endif
@@ -165,6 +201,25 @@ extern void native_load_gs_index(unsigned);
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+/*
+ * x86_32 user gs accessors: one interface, two ways of storing user gs.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS /* user gs is read from/written to the %gs register; per-task copy is thread.gs */
+#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
+#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+#define task_user_gs(tsk) ((tsk)->thread.gs)
+#define lazy_save_gs(v) savesegment(gs, (v))
+#define lazy_load_gs(v) loadsegment(gs, (v))
+#else /* !X86_32_LAZY_GS: user gs is the gs field of pt_regs; lazy_* do nothing */
+#define get_user_gs(regs) (u16)((regs)->gs)
+#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v) do { } while (0)
+#define lazy_load_gs(v) do { } while (0)
+#endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
+
static inline unsigned long get_limit(unsigned long segment)
{
unsigned long __limit;
@@ -314,6 +369,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
void default_idle(void);
+void stop_this_cpu(void *dummy);
+
/*
* Force strict CPU ordering.
* And yes, this is required on UP too when we're talking
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad..df9d5f78385 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,11 +20,13 @@
struct task_struct;
struct exec_domain;
#include <asm/processor.h>
+#include <asm/ftrace.h>
+#include <asm/atomic.h>
struct thread_info {
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
- unsigned long flags; /* low level flags */
+ __u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
int preempt_count; /* 0 => preemptable,
@@ -38,6 +40,7 @@ struct thread_info {
*/
__u8 supervisor_stack[0];
#endif
+ int uaccess_err;
};
#define INIT_THREAD_INFO(tsk) \
@@ -91,7 +94,6 @@ struct thread_info {
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
-#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -113,7 +115,6 @@ struct thread_info {
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
-#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -139,8 +140,7 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
- _TIF_NOTSC)
+ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
@@ -195,25 +195,21 @@ static inline struct thread_info *current_thread_info(void)
#else /* X86_32 */
-#include <asm/pda.h>
+#include <asm/percpu.h>
+#define KERNEL_STACK_OFFSET (5*8)
/*
* macros/functions for gaining access to the thread information structure
* preempt_count needs to be 1 initially, until the scheduler is functional.
*/
#ifndef __ASSEMBLY__
-static inline struct thread_info *current_thread_info(void)
-{
- struct thread_info *ti;
- ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
- return ti;
-}
+DECLARE_PER_CPU(unsigned long, kernel_stack);
-/* do not use in interrupt context */
-static inline struct thread_info *stack_thread_info(void)
+static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
- asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
+ ti = (void *)(percpu_read(kernel_stack) +
+ KERNEL_STACK_OFFSET - THREAD_SIZE);
return ti;
}
@@ -221,8 +217,8 @@ static inline struct thread_info *stack_thread_info(void)
/* how to get the thread information struct from ASM */
#define GET_THREAD_INFO(reg) \
- movq %gs:pda_kernelstack,reg ; \
- subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
+ movq PER_CPU_VAR(kernel_stack),reg ; \
+ subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
#endif
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index 1287dc1347d..b5c9d45c981 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -1,18 +1,13 @@
-/* x86 architecture timex specifications */
#ifndef _ASM_X86_TIMEX_H
#define _ASM_X86_TIMEX_H
#include <asm/processor.h>
#include <asm/tsc.h>
-#ifdef CONFIG_X86_ELAN
-# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
-#elif defined(CONFIG_X86_RDC321X)
-# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
-#else
-# define PIT_TICK_RATE 1193182 /* Underlying HZ */
-#endif
-#define CLOCK_TICK_RATE PIT_TICK_RATE
+/* The PIT ticks at this frequency (in HZ): */
+#define PIT_TICK_RATE 1193182
+
+#define CLOCK_TICK_RATE PIT_TICK_RATE
#define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0e7bbb54911..d3539f998f8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -113,7 +113,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
__flush_tlb();
}
-static inline void native_flush_tlb_others(const cpumask_t *cpumask,
+static inline void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va)
{
@@ -142,31 +142,28 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
flush_tlb_mm(vma->vm_mm);
}
-void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
- unsigned long va);
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long va);
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-#ifdef CONFIG_X86_32
struct tlb_state {
struct mm_struct *active_mm;
int state;
- char __cacheline_padding[L1_CACHE_BYTES-8];
};
DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
-void reset_lazy_tlbstate(void);
-#else
static inline void reset_lazy_tlbstate(void)
{
+ percpu_write(cpu_tlbstate.state, 0);
+ percpu_write(cpu_tlbstate.active_mm, &init_mm);
}
-#endif
#endif /* SMP */
#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
+#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
#endif
static inline void flush_tlb_kernel_range(unsigned long start,
@@ -175,4 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}
+extern void zap_low_mappings(void);
+
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 4850e4b02b6..77cfb2cfb38 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -61,13 +61,21 @@ static inline int cpu_to_node(int cpu)
*
* Side note: this function creates the returned cpumask on the stack
* so with a high NR_CPUS count, excessive stack space is used. The
- * node_to_cpumask_ptr function should be used whenever possible.
+ * cpumask_of_node function should be used whenever possible.
*/
static inline cpumask_t node_to_cpumask(int node)
{
return node_to_cpumask_map[node];
}
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline const struct cpumask *cpumask_of_node(int node)
+{
+ return &node_to_cpumask_map[node];
+}
+
+static inline void setup_node_to_cpumask_map(void) { }
+
#else /* CONFIG_X86_64 */
/* Mappings between node number and cpus on that node. */
@@ -77,12 +85,13 @@ extern cpumask_t *node_to_cpumask_map;
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
/* Returns the number of the current Node. */
-#define numa_node_id() read_pda(nodenumber)
+DECLARE_PER_CPU(int, node_number);
+#define numa_node_id() percpu_read(node_number)
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
extern int cpu_to_node(int cpu);
extern int early_cpu_to_node(int cpu);
-extern const cpumask_t *_node_to_cpumask_ptr(int node);
+extern const cpumask_t *cpumask_of_node(int node);
extern cpumask_t node_to_cpumask(int node);
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
@@ -96,14 +105,11 @@ static inline int cpu_to_node(int cpu)
/* Same function but used if called before per_cpu areas are setup */
static inline int early_cpu_to_node(int cpu)
{
- if (early_per_cpu_ptr(x86_cpu_to_node_map))
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
- return per_cpu(x86_cpu_to_node_map, cpu);
+ return early_per_cpu(x86_cpu_to_node_map, cpu);
}
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+static inline const cpumask_t *cpumask_of_node(int node)
{
return &node_to_cpumask_map[node];
}
@@ -116,12 +122,17 @@ static inline cpumask_t node_to_cpumask(int node)
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-/* Replace default node_to_cpumask_ptr with optimized version */
+extern void setup_node_to_cpumask_map(void);
+
+/*
+ * Replace default node_to_cpumask_ptr with optimized version
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
#define node_to_cpumask_ptr(v, node) \
- const cpumask_t *v = _node_to_cpumask_ptr(node)
+ const cpumask_t *v = cpumask_of_node(node)
#define node_to_cpumask_ptr_next(v, node) \
- v = _node_to_cpumask_ptr(node)
+ v = cpumask_of_node(node)
#endif /* CONFIG_X86_64 */
@@ -183,11 +194,22 @@ extern int __node_distance(int, int);
#else /* !CONFIG_NUMA */
-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+static inline int numa_node_id(void)
+{
+ return 0;
+}
-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+static inline int cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline int early_cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline const cpumask_t *cpumask_of_node(int node)
{
return &cpu_online_map;
}
@@ -200,12 +222,17 @@ static inline int node_to_first_cpu(int node)
return first_cpu(cpu_online_map);
}
-/* Replace default node_to_cpumask_ptr with optimized version */
+static inline void setup_node_to_cpumask_map(void) { }
+
+/*
+ * Replace default node_to_cpumask_ptr with optimized version
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
#define node_to_cpumask_ptr(v, node) \
- const cpumask_t *v = _node_to_cpumask_ptr(node)
+ const cpumask_t *v = cpumask_of_node(node)
#define node_to_cpumask_ptr_next(v, node) \
- v = _node_to_cpumask_ptr(node)
+ v = cpumask_of_node(node)
#endif
#include <asm-generic/topology.h>
@@ -214,18 +241,20 @@ static inline int node_to_first_cpu(int node)
/* Returns the number of the first CPU on Node 'node'. */
static inline int node_to_first_cpu(int node)
{
- node_to_cpumask_ptr(mask, node);
- return first_cpu(*mask);
+ return cpumask_first(cpumask_of_node(node));
}
#endif
extern cpumask_t cpu_coregroup_map(int cpu);
+extern const struct cpumask *cpu_coregroup_mask(int cpu);
#ifdef ENABLE_TOPO_DEFINES
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu))
+#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
/* indicates that pointers to the topology cpumask_t maps are valid */
#define arch_provides_topology_pointers yes
@@ -239,7 +268,7 @@ struct pci_bus;
void set_pci_bus_resources_arch_default(struct pci_bus *b);
#ifdef CONFIG_SMP
-#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
+#define mc_capable() (cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids)
#define smt_capable() (smp_num_siblings > 1)
#endif
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index fa0d79facdb..90f06c25221 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,6 +3,7 @@
#ifndef __ASSEMBLY__
+#ifdef CONFIG_X86_TRAMPOLINE
/*
* Trampoline 80x86 program as an array.
*/
@@ -12,9 +13,16 @@ extern unsigned char *trampoline_base;
extern unsigned long init_rsp;
extern unsigned long initial_code;
+extern unsigned long initial_gs;
+#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
#define TRAMPOLINE_BASE 0x6000
+
extern unsigned long setup_trampoline(void);
+extern void __init reserve_trampoline_memory(void);
+#else
+static inline void reserve_trampoline_memory(void) { } /* no-op stub when CONFIG_X86_TRAMPOLINE is not set */
+#endif /* CONFIG_X86_TRAMPOLINE */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 45dee286e45..0d5342515b8 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
+#ifdef CONFIG_X86_64
+dotraplinkage void do_double_fault(struct pt_regs *, long);
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
+#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
@@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition)
extern int panic_on_unrecovered_nmi;
extern int kstack_depth_to_print;
-#ifdef CONFIG_X86_32
void math_error(void __user *);
+void math_emulate(struct math_emu_info *);
+#ifdef CONFIG_X86_32
unsigned long patch_espfix_desc(unsigned long, unsigned long);
-asmlinkage void math_emulate(long);
+#else
+asmlinkage void smp_thermal_interrupt(void);
+asmlinkage void mce_threshold_interrupt(void);
#endif
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9cd83a8e40d..38ae163cc91 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -34,8 +34,6 @@ static inline cycles_t get_cycles(void)
static __always_inline cycles_t vget_cycles(void)
{
- cycles_t cycles;
-
/*
* We only do VDSOs on TSC capable CPUs, so this shouldnt
* access boot_cpu_data (which is not VDSO-safe):
@@ -44,11 +42,7 @@ static __always_inline cycles_t vget_cycles(void)
if (!cpu_has_tsc)
return 0;
#endif
- rdtsc_barrier();
- cycles = (cycles_t)__native_read_tsc();
- rdtsc_barrier();
-
- return cycles;
+ return (cycles_t)__native_read_tsc();
}
extern void tsc_init(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35c54921b2e..b685ece89d5 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -121,7 +121,7 @@ extern int __get_user_bad(void);
#define __get_user_x(size, ret, x, ptr) \
asm volatile("call __get_user_" #size \
- : "=a" (ret),"=d" (x) \
+ : "=a" (ret), "=d" (x) \
: "0" (ptr)) \
/* Careful: we have to cast the result to the type of the pointer
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
int __ret_gu; \
unsigned long __val_gu; \
__chk_user_ptr(ptr); \
+ might_fault(); \
switch (sizeof(*(ptr))) { \
case 1: \
__get_user_x(1, __ret_gu, __val_gu, ptr); \
@@ -180,12 +181,12 @@ extern int __get_user_bad(void);
#define __put_user_x(size, x, ptr, __ret_pu) \
asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
- :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+ : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#ifdef CONFIG_X86_32
-#define __put_user_u64(x, addr, err) \
+#define __put_user_asm_u64(x, addr, err, errret) \
asm volatile("1: movl %%eax,0(%2)\n" \
"2: movl %%edx,4(%2)\n" \
"3:\n" \
@@ -196,14 +197,24 @@ extern int __get_user_bad(void);
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=r" (err) \
- : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
+ : "A" (x), "r" (addr), "i" (errret), "0" (err))
+
+#define __put_user_asm_ex_u64(x, addr) \
+ asm volatile("1: movl %%eax,0(%1)\n" \
+ "2: movl %%edx,4(%1)\n" \
+ "3:\n" \
+ _ASM_EXTABLE(1b, 2b - 1b) \
+ _ASM_EXTABLE(2b, 3b - 2b) \
+ : : "A" (x), "r" (addr))
#define __put_user_x8(x, ptr, __ret_pu) \
asm volatile("call __put_user_8" : "=a" (__ret_pu) \
: "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#else
-#define __put_user_u64(x, ptr, retval) \
- __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
+#define __put_user_asm_u64(x, ptr, retval, errret) \
+ __put_user_asm(x, ptr, retval, "q", "", "Zr", errret)
+#define __put_user_asm_ex_u64(x, addr) \
+ __put_user_asm_ex(x, addr, "q", "", "Zr")
#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
#endif
@@ -241,6 +252,7 @@ extern void __put_user_8(void);
int __ret_pu; \
__typeof__(*(ptr)) __pu_val; \
__chk_user_ptr(ptr); \
+ might_fault(); \
__pu_val = x; \
switch (sizeof(*(ptr))) { \
case 1: \
@@ -274,10 +286,32 @@ do { \
__put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
break; \
case 4: \
- __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\
+ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \
break; \
case 8: \
- __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \
+ __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval, \
+ errret); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ } \
+} while (0)
+
+#define __put_user_size_ex(x, ptr, size) \
+do { \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __put_user_asm_ex(x, ptr, "b", "b", "iq"); \
+ break; \
+ case 2: \
+ __put_user_asm_ex(x, ptr, "w", "w", "ir"); \
+ break; \
+ case 4: \
+ __put_user_asm_ex(x, ptr, "l", "k", "ir"); \
+ break; \
+ case 8: \
+ __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \
break; \
default: \
__put_user_bad(); \
@@ -309,9 +343,12 @@ do { \
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
+#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad()
#else
#define __get_user_asm_u64(x, ptr, retval, errret) \
__get_user_asm(x, ptr, retval, "q", "", "=r", errret)
+#define __get_user_asm_ex_u64(x, ptr) \
+ __get_user_asm_ex(x, ptr, "q", "", "=r")
#endif
#define __get_user_size(x, ptr, size, retval, errret) \
@@ -348,16 +385,43 @@ do { \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
+#define __get_user_size_ex(x, ptr, size) \
+do { \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __get_user_asm_ex(x, ptr, "b", "b", "=q"); \
+ break; \
+ case 2: \
+ __get_user_asm_ex(x, ptr, "w", "w", "=r"); \
+ break; \
+ case 4: \
+ __get_user_asm_ex(x, ptr, "l", "k", "=r"); \
+ break; \
+ case 8: \
+ __get_user_asm_ex_u64(x, ptr); \
+ break; \
+ default: \
+ (x) = __get_user_bad(); \
+ } \
+} while (0)
+
+#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
+ asm volatile("1: mov"itype" %1,%"rtype"0\n" \
+ "2:\n" \
+ _ASM_EXTABLE(1b, 2b - 1b) \
+ : ltype(x) : "m" (__m(addr)))
+
#define __put_user_nocheck(x, ptr, size) \
({ \
- long __pu_err; \
+ int __pu_err; \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
__pu_err; \
})
#define __get_user_nocheck(x, ptr, size) \
({ \
- long __gu_err; \
+ int __gu_err; \
unsigned long __gu_val; \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
@@ -383,6 +447,26 @@ struct __large_struct { unsigned long buf[100]; };
_ASM_EXTABLE(1b, 3b) \
: "=r"(err) \
: ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \
+ asm volatile("1: mov"itype" %"rtype"0,%1\n" \
+ "2:\n" \
+ _ASM_EXTABLE(1b, 2b - 1b) \
+ : : ltype(x), "m" (__m(addr)))
+
+/*
+ * uaccess_try and catch
+ */
+#define uaccess_try do { \
+ int prev_err = current_thread_info()->uaccess_err; \
+ current_thread_info()->uaccess_err = 0; \
+ barrier();
+
+#define uaccess_catch(err) \
+ (err) |= current_thread_info()->uaccess_err; \
+ current_thread_info()->uaccess_err = prev_err; \
+} while (0)
+
/**
* __get_user: - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
@@ -406,6 +490,7 @@ struct __large_struct { unsigned long buf[100]; };
#define __get_user(x, ptr) \
__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+
/**
* __put_user: - Write a simple value into user space, with less checking.
* @x: Value to copy to user space.
@@ -433,6 +518,45 @@ struct __large_struct { unsigned long buf[100]; };
#define __put_user_unaligned __put_user
/*
+ * {get|put}_user_try and catch
+ *
+ * get_user_try {
+ * get_user_ex(...);
+ * } get_user_catch(err)
+ */
+#define get_user_try uaccess_try
+#define get_user_catch(err) uaccess_catch(err)
+
+#define get_user_ex(x, ptr) do { \
+ unsigned long __gue_val; \
+ __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \
+ (x) = (__force __typeof__(*(ptr)))__gue_val; \
+} while (0)
+
+#ifdef CONFIG_X86_WP_WORKS_OK
+
+#define put_user_try uaccess_try
+#define put_user_catch(err) uaccess_catch(err)
+
+#define put_user_ex(x, ptr) \
+ __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+
+#else /* !CONFIG_X86_WP_WORKS_OK */
+
+#define put_user_try do { \
+ int __uaccess_err = 0;
+
+#define put_user_catch(err) \
+ (err) |= __uaccess_err; \
+} while (0)
+
+#define put_user_ex(x, ptr) do { \
+ __uaccess_err |= __put_user(x, ptr); \
+} while (0)
+
+#endif /* CONFIG_X86_WP_WORKS_OK */
+
+/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
#ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1..5e06259e90e 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
static __always_inline unsigned long __must_check
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
- might_sleep();
- return __copy_to_user_inatomic(to, from, n);
+ might_fault();
+ return __copy_to_user_inatomic(to, from, n);
}
static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
static __always_inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (__builtin_constant_p(n)) {
unsigned long ret;
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
static __always_inline unsigned long __copy_from_user_nocache(void *to,
const void __user *from, unsigned long n)
{
- might_sleep();
+ might_fault();
if (__builtin_constant_p(n)) {
unsigned long ret;
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 664f15280f1..84210c479fc 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
int __copy_from_user(void *dst, const void __user *src, unsigned size)
{
int ret = 0;
+
+ might_fault();
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
@@ -46,7 +48,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
return ret;
case 10:
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
- ret, "q", "", "=r", 16);
+ ret, "q", "", "=r", 10);
if (unlikely(ret))
return ret;
__get_user_asm(*(u16 *)(8 + (char *)dst),
@@ -71,6 +73,8 @@ static __always_inline __must_check
int __copy_to_user(void __user *dst, const void *src, unsigned size)
{
int ret = 0;
+
+ might_fault();
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
{
int ret = 0;
+
+ might_fault();
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst,
(__force void *)src, size);
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 834b2c1d89f..d2e415e6666 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -639,8 +639,8 @@ __SYSCALL(__NR_fallocate, sys_fallocate)
__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
#define __NR_timerfd_gettime 287
__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
-#define __NR_paccept 288
-__SYSCALL(__NR_paccept, sys_paccept)
+#define __NR_accept4 288
+__SYSCALL(__NR_accept4, sys_accept4)
#define __NR_signalfd4 289
__SYSCALL(__NR_signalfd4, sys_signalfd4)
#define __NR_eventfd2 290
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
deleted file mode 100644
index 8b064bd9c55..00000000000
--- a/arch/x86/include/asm/unwind.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_X86_UNWIND_H
-#define _ASM_X86_UNWIND_H
-
-#define UNW_PC(frame) ((void)(frame), 0UL)
-#define UNW_SP(frame) ((void)(frame), 0UL)
-#define UNW_FP(frame) ((void)(frame), 0UL)
-
-static inline int arch_unw_user_mode(const void *info)
-{
- return 0;
-}
-
-#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index d931d3b7e6f..7ed17ff502b 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -32,13 +32,18 @@
enum uv_bios_cmd {
UV_BIOS_COMMON,
UV_BIOS_GET_SN_INFO,
- UV_BIOS_FREQ_BASE
+ UV_BIOS_FREQ_BASE,
+ UV_BIOS_WATCHLIST_ALLOC,
+ UV_BIOS_WATCHLIST_FREE,
+ UV_BIOS_MEMPROTECT,
+ UV_BIOS_GET_PARTITION_ADDR
};
/*
* Status values returned from a BIOS call.
*/
enum {
+ BIOS_STATUS_MORE_PASSES = 1,
BIOS_STATUS_SUCCESS = 0,
BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
BIOS_STATUS_EINVAL = -EINVAL,
@@ -71,6 +76,21 @@ union partition_info_u {
};
};
+union uv_watchlist_u {
+ u64 val;
+ struct {
+ u64 blade : 16,
+ size : 32,
+ filler : 16;
+ };
+};
+
+enum uv_memprotect {
+ UV_MEMPROT_RESTRICT_ACCESS,
+ UV_MEMPROT_ALLOW_AMO,
+ UV_MEMPROT_ALLOW_RW
+};
+
/*
* bios calls have 6 parameters
*/
@@ -80,14 +100,20 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
extern s64 uv_bios_freq_base(u64, u64 *);
+extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
+ unsigned long *);
+extern int uv_bios_mq_watchlist_free(int, int);
+extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
+extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
extern void uv_bios_init(void);
+extern unsigned long sn_rtc_cycles_per_second;
extern int uv_type;
extern long sn_partition_id;
-extern long uv_coherency_id;
-extern long uv_region_size;
-#define partition_coherence_id() (uv_coherency_id)
+extern long sn_coherency_id;
+extern long sn_region_size;
+#define partition_coherence_id() (sn_coherency_id)
extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
new file mode 100644
index 00000000000..8242bf96581
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_UV_UV_H
+#define _ASM_X86_UV_UV_H
+
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+
+struct cpumask;
+struct mm_struct;
+
+#ifdef CONFIG_X86_UV
+
+extern enum uv_system_type get_uv_system_type(void);
+extern int is_uv_system(void);
+extern void uv_cpu_init(void);
+extern void uv_system_init(void);
+extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
+extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm,
+ unsigned long va,
+ unsigned int cpu);
+
+#else /* X86_UV */
+
+static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
+static inline int is_uv_system(void) { return 0; }
+static inline void uv_cpu_init(void) { }
+static inline void uv_system_init(void) { }
+static inline int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
+{ return 1; }
+static inline const struct cpumask *
+uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
+ unsigned long va, unsigned int cpu)
+{ return cpumask; }
+
+#endif /* X86_UV */
+
+#endif /* _ASM_X86_UV_UV_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index e2363253bbb..9b0e61bf7a8 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -133,61 +133,61 @@ struct bau_msg_payload {
* see table 4.2.3.0.1 in broacast_assist spec.
*/
struct bau_msg_header {
- int dest_subnodeid:6; /* must be zero */
+ unsigned int dest_subnodeid:6; /* must be zero */
/* bits 5:0 */
- int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
- /* bits 20:6 */
- int command:8; /* message type */
+ unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
+ /* bits 20:6 */ /* first bit in node_map */
+ unsigned int command:8; /* message type */
/* bits 28:21 */
/* 0x38: SN3net EndPoint Message */
- int rsvd_1:3; /* must be zero */
+ unsigned int rsvd_1:3; /* must be zero */
/* bits 31:29 */
/* int will align on 32 bits */
- int rsvd_2:9; /* must be zero */
+ unsigned int rsvd_2:9; /* must be zero */
/* bits 40:32 */
/* Suppl_A is 56-41 */
- int payload_2a:8; /* becomes byte 16 of msg */
+ unsigned int payload_2a:8;/* becomes byte 16 of msg */
/* bits 48:41 */ /* not currently using */
- int payload_2b:8; /* becomes byte 17 of msg */
+ unsigned int payload_2b:8;/* becomes byte 17 of msg */
/* bits 56:49 */ /* not currently using */
/* Address field (96:57) is never used as an
address (these are address bits 42:3) */
- int rsvd_3:1; /* must be zero */
+ unsigned int rsvd_3:1; /* must be zero */
/* bit 57 */
/* address bits 27:4 are payload */
/* these 24 bits become bytes 12-14 of msg */
- int replied_to:1; /* sent as 0 by the source to byte 12 */
+ unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
/* bit 58 */
- int payload_1a:5; /* not currently used */
+ unsigned int payload_1a:5;/* not currently used */
/* bits 63:59 */
- int payload_1b:8; /* not currently used */
+ unsigned int payload_1b:8;/* not currently used */
/* bits 71:64 */
- int payload_1c:8; /* not currently used */
+ unsigned int payload_1c:8;/* not currently used */
/* bits 79:72 */
- int payload_1d:2; /* not currently used */
+ unsigned int payload_1d:2;/* not currently used */
/* bits 81:80 */
- int rsvd_4:7; /* must be zero */
+ unsigned int rsvd_4:7; /* must be zero */
/* bits 88:82 */
- int sw_ack_flag:1; /* software acknowledge flag */
+ unsigned int sw_ack_flag:1;/* software acknowledge flag */
/* bit 89 */
/* INTD trasactions at destination are to
wait for software acknowledge */
- int rsvd_5:6; /* must be zero */
+ unsigned int rsvd_5:6; /* must be zero */
/* bits 95:90 */
- int rsvd_6:5; /* must be zero */
+ unsigned int rsvd_6:5; /* must be zero */
/* bits 100:96 */
- int int_both:1; /* if 1, interrupt both sockets on the blade */
+ unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
/* bit 101*/
- int fairness:3; /* usually zero */
+ unsigned int fairness:3;/* usually zero */
/* bits 104:102 */
- int multilevel:1; /* multi-level multicast format */
+ unsigned int multilevel:1; /* multi-level multicast format */
/* bit 105 */
/* 0 for TLB: endpoint multi-unicast messages */
- int chaining:1; /* next descriptor is part of this activation*/
+ unsigned int chaining:1;/* next descriptor is part of this activation*/
/* bit 106 */
- int rsvd_7:21; /* must be zero */
+ unsigned int rsvd_7:21; /* must be zero */
/* bits 127:107 */
};
@@ -325,7 +325,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
#define cpubit_isset(cpu, bau_local_cpumask) \
test_bit((cpu), (bau_local_cpumask).bits)
-extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
extern void uv_bau_message_intr1(void);
extern void uv_bau_timeout_intr1(void);
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 7a5782610b2..777327ef05c 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -113,25 +113,37 @@
*/
#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
+struct uv_scir_s {
+ struct timer_list timer;
+ unsigned long offset;
+ unsigned long last;
+ unsigned long idle_on;
+ unsigned long idle_off;
+ unsigned char state;
+ unsigned char enabled;
+};
+
/*
* The following defines attributes of the HUB chip. These attributes are
* frequently referenced and are kept in the per-cpu data areas of each cpu.
* They are kept together in a struct to minimize cache misses.
*/
struct uv_hub_info_s {
- unsigned long global_mmr_base;
- unsigned long gpa_mask;
- unsigned long gnode_upper;
- unsigned long lowmem_remap_top;
- unsigned long lowmem_remap_base;
- unsigned short pnode;
- unsigned short pnode_mask;
- unsigned short coherency_domain_number;
- unsigned short numa_blade_id;
- unsigned char blade_processor_id;
- unsigned char m_val;
- unsigned char n_val;
+ unsigned long global_mmr_base;
+ unsigned long gpa_mask;
+ unsigned long gnode_upper;
+ unsigned long lowmem_remap_top;
+ unsigned long lowmem_remap_base;
+ unsigned short pnode;
+ unsigned short pnode_mask;
+ unsigned short coherency_domain_number;
+ unsigned short numa_blade_id;
+ unsigned char blade_processor_id;
+ unsigned char m_val;
+ unsigned char n_val;
+ struct uv_scir_s scir;
};
+
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
@@ -163,6 +175,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define UV_APIC_PNODE_SHIFT 6
+/* Local Bus from cpu's perspective */
+#define LOCAL_BUS_BASE 0x1c00000
+#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
+
+/*
+ * System Controller Interface Reg
+ *
+ * Note there are NO leds on a UV system. This register is only
+ * used by the system controller to monitor system-wide operation.
+ * There are 64 regs per node. With Nehalem cpus (2 cores per node,
+ * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
+ * a node.
+ *
+ * The window is located at top of ACPI MMR space
+ */
+#define SCIR_WINDOW_COUNT 64
+#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \
+ LOCAL_BUS_SIZE - \
+ SCIR_WINDOW_COUNT)
+
+#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */
+#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
+#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
+
/*
* Macros for converting between kernel virtual addresses, socket local physical
* addresses, and UV global physical addresses.
@@ -174,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
{
if (paddr < uv_hub_info->lowmem_remap_top)
- paddr += uv_hub_info->lowmem_remap_base;
+ paddr |= uv_hub_info->lowmem_remap_base;
return paddr | uv_hub_info->gnode_upper;
}
@@ -182,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
/* socket virtual --> UV global physical address */
static inline unsigned long uv_gpa(void *v)
{
- return __pa(v) | uv_hub_info->gnode_upper;
-}
-
-/* socket virtual --> UV global physical address */
-static inline void *uv_vgpa(void *v)
-{
- return (void *)uv_gpa(v);
-}
-
-/* UV global physical address --> socket virtual */
-static inline void *uv_va(unsigned long gpa)
-{
- return __va(gpa & uv_hub_info->gpa_mask);
+ return uv_soc_phys_ram_to_gpa(__pa(v));
}
/* pnode, offset --> socket virtual */
@@ -277,6 +301,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
*uv_local_mmr_address(offset) = val;
}
+static inline unsigned char uv_read_local_mmr8(unsigned long offset)
+{
+ return *((unsigned char *)uv_local_mmr_address(offset));
+}
+
+static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
+{
+ *((unsigned char *)uv_local_mmr_address(offset)) = val;
+}
+
/*
* Structures and definitions for converting between cpu, node, pnode, and blade
* numbers.
@@ -351,5 +385,20 @@ static inline int uv_num_possible_blades(void)
return uv_possible_blades;
}
-#endif /* _ASM_X86_UV_UV_HUB_H */
+/* Update SCIR state */
+static inline void uv_set_scir_bits(unsigned char value)
+{
+ if (uv_hub_info->scir.state != value) {
+ uv_hub_info->scir.state = value;
+ uv_write_local_mmr8(uv_hub_info->scir.offset, value);
+ }
+}
+static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
+{
+ if (uv_cpu_hub_info(cpu)->scir.state != value) {
+ uv_cpu_hub_info(cpu)->scir.state = value;
+ uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
+ }
+}
+#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 00000000000..59363627523
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,132 @@
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#include <asm/vmx.h>
+#include <asm/svm.h>
+
+/*
+ * VMX functions:
+ */
+
+static inline int cpu_has_vmx(void)
+{
+ unsigned long ecx = cpuid_ecx(1);
+ return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes an undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ */
+static inline void cpu_vmxoff(void)
+{
+ asm volatile (ASM_VMX_VMXOFF : : : "cc");
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
+static inline int cpu_vmx_enabled(void)
+{
+ return read_cr4() & X86_CR4_VMXE;
+}
+
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * You shouldn't call this if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+ if (cpu_vmx_enabled())
+ cpu_vmxoff();
+}
+
+/** Disable VMX if it is supported and enabled on the current CPU
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+ if (cpu_has_vmx())
+ __cpu_emergency_vmxoff();
+}
+
+
+
+
+/*
+ * SVM functions:
+ */
+
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * in the messages; gcc should take care of not generating code for
+ * the messages in this case.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+ uint32_t eax, ebx, ecx, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+ if (msg)
+ *msg = "not amd";
+ return 0;
+ }
+
+ cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+ if (eax < SVM_CPUID_FUNC) {
+ if (msg)
+ *msg = "can't execute cpuid_8000000a";
+ return 0;
+ }
+
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+ if (msg)
+ *msg = "svm not available";
+ return 0;
+ }
+ return 1;
+}
+
+
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ */
+static inline void cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+}
+
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+ if (cpu_has_svm(NULL))
+ cpu_svm_disable();
+}
+
+#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
index b7c0dea119f..61e08c0a290 100644
--- a/arch/x86/include/asm/vmi.h
+++ b/arch/x86/include/asm/vmi.h
@@ -223,9 +223,15 @@ struct pci_header {
} __attribute__((packed));
/* Function prototypes for bootstrapping */
+#ifdef CONFIG_VMI
extern void vmi_init(void);
+extern void vmi_activate(void);
extern void vmi_bringup(void);
-extern void vmi_apply_boot_page_allocations(void);
+#else
+static inline void vmi_init(void) {}
+static inline void vmi_activate(void) {}
+static inline void vmi_bringup(void) {}
+#endif
/* State needed to start an application processor in an SMP system. */
struct vmi_ap_state {
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
new file mode 100644
index 00000000000..c11b7e100d8
--- /dev/null
+++ b/arch/x86/include/asm/vmware.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2008, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef ASM_X86__VMWARE_H
+#define ASM_X86__VMWARE_H
+
+extern unsigned long vmware_get_tsc_khz(void);
+extern int vmware_platform(void);
+extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
index 3e010d21fdd..d0238e6151d 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -63,10 +63,13 @@
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
+#define VM_EXIT_SAVE_IA32_PAT 0x00040000
+#define VM_EXIT_LOAD_IA32_PAT 0x00080000
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
/* VMCS Encodings */
enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_IA32_PAT = 0x00002804,
+ GUEST_IA32_PAT_HIGH = 0x00002805,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
GUEST_PDPTR2_HIGH = 0x0000280f,
GUEST_PDPTR3 = 0x00002810,
GUEST_PDPTR3_HIGH = 0x00002811,
+ HOST_IA32_PAT = 0x00002c00,
+ HOST_IA32_PAT_HIGH = 0x00002c01,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
+#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
#define VMX_NR_VPIDS (1 << 16)
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
@@ -352,7 +360,23 @@ enum vmcs_field {
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
+#define VMX_EPT_IGMT_BIT (1ull << 6)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
+
+#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+
+
#endif
diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h
index b3e64730762..c1635d43616 100644
--- a/arch/x86/include/asm/voyager.h
+++ b/arch/x86/include/asm/voyager.h
@@ -527,3 +527,45 @@ extern void voyager_smp_intr_init(void);
#define VOYAGER_PSI_SUBREAD 2
#define VOYAGER_PSI_SUBWRITE 3
extern void voyager_cat_psi(__u8, __u16, __u8 *);
+
+/* These define the CPIs we use in linux */
+#define VIC_CPI_LEVEL0 0
+#define VIC_CPI_LEVEL1 1
+/* now the fake CPIs */
+#define VIC_TIMER_CPI 2
+#define VIC_INVALIDATE_CPI 3
+#define VIC_RESCHEDULE_CPI 4
+#define VIC_ENABLE_IRQ_CPI 5
+#define VIC_CALL_FUNCTION_CPI 6
+#define VIC_CALL_FUNCTION_SINGLE_CPI 7
+
+/* Now the QIC CPIs: Since we don't need the two initial levels, these
+ * sit below the VIC CPIs by QIC_CPI_OFFSET (currently 1, not 2) */
+#define QIC_CPI_OFFSET 1
+#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
+#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
+#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
+#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
+
+#define VIC_START_FAKE_CPI VIC_TIMER_CPI
+#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
+
+/* this is the SYS_INT CPI. */
+#define VIC_SYS_INT 8
+#define VIC_CMN_INT 15
+
+/* This is the boot CPI for alternate processors. It gets overwritten
+ * by the above once the system has activated all available processors */
+#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
+#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
+
+extern asmlinkage void vic_cpi_interrupt(void);
+extern asmlinkage void vic_sys_interrupt(void);
+extern asmlinkage void vic_cmn_interrupt(void);
+extern asmlinkage void qic_timer_interrupt(void);
+extern asmlinkage void qic_invalidate_interrupt(void);
+extern asmlinkage void qic_reschedule_interrupt(void);
+extern asmlinkage void qic_enable_irq_interrupt(void);
+extern asmlinkage void qic_call_function_interrupt(void);
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 19144184983..1df35417c41 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -15,10 +15,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
return raw_irqs_disabled_flags(regs->flags);
}
-static inline void xen_do_IRQ(int irq, struct pt_regs *regs)
-{
- regs->orig_ax = ~irq;
- do_IRQ(regs);
-}
-
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 3f6000d95fe..5e79ca69432 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -33,8 +33,14 @@
#ifndef _ASM_X86_XEN_HYPERCALL_H
#define _ASM_X86_XEN_HYPERCALL_H
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index a38d25ac87d..81fbd735aec 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -33,39 +33,10 @@
#ifndef _ASM_X86_XEN_HYPERVISOR_H
#define _ASM_X86_XEN_HYPERVISOR_H
-#include <linux/types.h>
-#include <linux/kernel.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/version.h>
-
-#include <asm/ptrace.h>
-#include <asm/page.h>
-#include <asm/desc.h>
-#if defined(__i386__)
-# ifdef CONFIG_X86_PAE
-# include <asm-generic/pgtable-nopud.h>
-# else
-# include <asm-generic/pgtable-nopmd.h>
-# endif
-#endif
-#include <asm/xen/hypercall.h>
-
/* arch/i386/kernel/setup.c */
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
-/* arch/i386/mach-xen/evtchn.c */
-/* Force a proper event-channel callback from Xen. */
-extern void force_evtchn_callback(void);
-
-/* Turn jiffies into Xen system time. */
-u64 jiffies_to_st(unsigned long jiffies);
-
-
-#define MULTI_UVMFLAGS_INDEX 3
-#define MULTI_UVMDOMID_INDEX 4
-
enum xen_domain_type {
XEN_NATIVE,
XEN_PV_DOMAIN,
@@ -74,9 +45,15 @@ enum xen_domain_type {
extern enum xen_domain_type xen_domain_type;
+#ifdef CONFIG_XEN
#define xen_domain() (xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
+#else
+#define xen_domain() (0)
+#endif
+
+#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN)
+#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN)
+
#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
-#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index bc628998a1b..4bd990ee43d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -1,11 +1,16 @@
#ifndef _ASM_X86_XEN_PAGE_H
#define _ASM_X86_XEN_PAGE_H
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
#include <linux/pfn.h>
#include <asm/uaccess.h>
+#include <asm/page.h>
#include <asm/pgtable.h>
+#include <xen/interface/xen.h>
#include <xen/features.h>
/* Xen machine address */
@@ -132,7 +137,7 @@ static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
pte_t pte;
pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
- (pgprot_val(pgprot) & __supported_pte_mask);
+ massage_pgprot(pgprot);
return pte;
}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e489ff9cb3e..24f357e7557 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
endif
#
@@ -22,13 +23,14 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
-obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
+obj-$(CONFIG_X86_32) += probe_32.o probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
@@ -41,36 +43,38 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
-obj-y += ds.o
+obj-$(CONFIG_X86_DS) += ds.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+obj-y += reboot.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
-obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP) += smpcommon.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o ipi.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o ipi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-$(CONFIG_X86_ES7000) += es7000_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
+obj-$(CONFIG_X86_SUMMIT) += summit_32.o
obj-y += vsmp_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module_$(BITS).o
@@ -105,20 +109,24 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
- obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+ obj-y += genapic_64.o genapic_flat_64.o
obj-y += genx2apic_cluster.o
obj-y += genx2apic_phys.o
+ obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
+ obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
- obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8c1f76abae9..956c1dee6fb 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -42,12 +42,8 @@
#include <asm/mpspec.h>
#include <asm/smp.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-# include <mach_apic.h>
-#endif
-
static int __initdata acpi_force = 0;
-
+u32 acpi_rsdt_forced;
#ifdef CONFIG_ACPI
int acpi_disabled = 0;
#else
@@ -56,16 +52,7 @@ int acpi_disabled = 1;
EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
-
-#include <asm/proto.h>
-
-#else /* X86 */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <mach_apic.h>
-#include <mach_mpparse.h>
-#endif /* CONFIG_X86_LOCAL_APIC */
-
+# include <asm/proto.h>
#endif /* X86 */
#define BAD_MADT_ENTRY(entry, end) ( \
@@ -121,35 +108,18 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
*/
char *__init __acpi_map_table(unsigned long phys, unsigned long size)
{
- unsigned long base, offset, mapped_size;
- int idx;
if (!phys || !size)
return NULL;
- if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
- return __va(phys);
-
- offset = phys & (PAGE_SIZE - 1);
- mapped_size = PAGE_SIZE - offset;
- clear_fixmap(FIX_ACPI_END);
- set_fixmap(FIX_ACPI_END, phys);
- base = fix_to_virt(FIX_ACPI_END);
-
- /*
- * Most cases can be covered by the below.
- */
- idx = FIX_ACPI_END;
- while (mapped_size < size) {
- if (--idx < FIX_ACPI_BEGIN)
- return NULL; /* cannot handle this */
- phys += PAGE_SIZE;
- clear_fixmap(idx);
- set_fixmap(idx, phys);
- mapped_size += PAGE_SIZE;
- }
+ return early_ioremap(phys, size);
+}
+void __init __acpi_unmap_table(char *map, unsigned long size)
+{
+ if (!map || !size)
+ return;
- return ((unsigned char *)base + offset);
+ early_iounmap(map, size);
}
#ifdef CONFIG_PCI_MMCONFIG
@@ -239,7 +209,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
madt->address);
}
- acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+ default_acpi_madt_oem_check(madt->header.oem_id,
+ madt->header.oem_table_id);
return 0;
}
@@ -538,9 +509,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *obj;
struct acpi_madt_local_apic *lapic;
- cpumask_t tmp_map, new_map;
+ cpumask_var_t tmp_map, new_map;
u8 physid;
int cpu;
+ int retval = -ENOMEM;
if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
return -EINVAL;
@@ -569,23 +541,37 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
buffer.length = ACPI_ALLOCATE_BUFFER;
buffer.pointer = NULL;
- tmp_map = cpu_present_map;
+ if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
+ goto out;
+
+ if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
+ goto free_tmp_map;
+
+ cpumask_copy(tmp_map, cpu_present_mask);
acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
/*
* If mp_register_lapic successfully generates a new logical cpu
* number, then the following will get us exactly what was mapped
*/
- cpus_andnot(new_map, cpu_present_map, tmp_map);
- if (cpus_empty(new_map)) {
+ cpumask_andnot(new_map, cpu_present_mask, tmp_map);
+ if (cpumask_empty(new_map)) {
printk ("Unable to map lapic to logical cpu number\n");
- return -EINVAL;
+ retval = -EINVAL;
+ goto free_new_map;
}
- cpu = first_cpu(new_map);
+ cpu = cpumask_first(new_map);
*pcpu = cpu;
- return 0;
+ retval = 0;
+
+free_new_map:
+ free_cpumask_var(new_map);
+free_tmp_map:
+ free_cpumask_var(tmp_map);
+out:
+ return retval;
}
/* wrapper to silence section mismatch warning */
@@ -598,7 +584,7 @@ EXPORT_SYMBOL(acpi_map_lsapic);
int acpi_unmap_lsapic(int cpu)
{
per_cpu(x86_cpu_to_apicid, cpu) = -1;
- cpu_clear(cpu, cpu_present_map);
+ set_cpu_present(cpu, false);
num_processors--;
return (0);
@@ -869,7 +855,7 @@ static struct {
DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
} mp_ioapic_routing[MAX_IO_APICS];
-static int mp_find_ioapic(int gsi)
+int mp_find_ioapic(int gsi)
{
int i = 0;
@@ -884,6 +870,16 @@ static int mp_find_ioapic(int gsi)
return -1;
}
+int mp_find_ioapic_pin(int ioapic, int gsi)
+{
+ if (WARN_ON(ioapic == -1))
+ return -1;
+ if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
+ return -1;
+
+ return gsi - mp_ioapic_routing[ioapic].gsi_base;
+}
+
static u8 __init uniq_ioapic_id(u8 id)
{
#ifdef CONFIG_X86_32
@@ -897,8 +893,8 @@ static u8 __init uniq_ioapic_id(u8 id)
DECLARE_BITMAP(used, 256);
bitmap_zero(used, 256);
for (i = 0; i < nr_ioapics; i++) {
- struct mp_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mp_apicid, used);
+ struct mpc_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->apicid, used);
}
if (!test_bit(id, used))
return id;
@@ -930,47 +926,70 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
idx = nr_ioapics;
- mp_ioapics[idx].mp_type = MP_IOAPIC;
- mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mp_apicaddr = address;
+ mp_ioapics[idx].type = MP_IOAPIC;
+ mp_ioapics[idx].flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+ mp_ioapics[idx].apicid = uniq_ioapic_id(id);
#ifdef CONFIG_X86_32
- mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+ mp_ioapics[idx].apicver = io_apic_get_version(idx);
#else
- mp_ioapics[idx].mp_apicver = 0;
+ mp_ioapics[idx].apicver = 0;
#endif
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
mp_ioapic_routing[idx].gsi_base = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
- mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
+ mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
nr_ioapics++;
}
-static void assign_to_mp_irq(struct mp_config_intsrc *m,
- struct mp_config_intsrc *mp_irq)
+int __init acpi_probe_gsi(void)
{
- memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+ int idx;
+ int gsi;
+ int max_gsi = 0;
+
+ if (acpi_disabled)
+ return 0;
+
+ if (!acpi_ioapic)
+ return 0;
+
+ max_gsi = 0;
+ for (idx = 0; idx < nr_ioapics; idx++) {
+ gsi = mp_ioapic_routing[idx].gsi_end;
+
+ if (gsi > max_gsi)
+ max_gsi = gsi;
+ }
+
+ return max_gsi + 1;
}
-static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
- struct mp_config_intsrc *m)
+static void assign_to_mp_irq(struct mpc_intsrc *m,
+ struct mpc_intsrc *mp_irq)
{
- return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+ memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
}
-static void save_mp_irq(struct mp_config_intsrc *m)
+static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
+ struct mpc_intsrc *m)
+{
+ return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
+}
+
+static void save_mp_irq(struct mpc_intsrc *m)
{
int i;
@@ -988,7 +1007,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
int ioapic;
int pin;
- struct mp_config_intsrc mp_irq;
+ struct mpc_intsrc mp_irq;
/*
* Convert 'gsi' to 'ioapic.pin'.
@@ -996,7 +1015,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
ioapic = mp_find_ioapic(gsi);
if (ioapic < 0)
return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
/*
* TBD: This check is for faulty timer entries, where the override
@@ -1006,13 +1025,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
if ((bus_irq == 0) && (trigger == 3))
trigger = 1;
- mp_irq.mp_type = MP_INTSRC;
- mp_irq.mp_irqtype = mp_INT;
- mp_irq.mp_irqflag = (trigger << 2) | polarity;
- mp_irq.mp_srcbus = MP_ISA_BUS;
- mp_irq.mp_srcbusirq = bus_irq; /* IRQ */
- mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
- mp_irq.mp_dstirq = pin; /* INTIN# */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq; /* IRQ */
+ mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+ mp_irq.dstirq = pin; /* INTIN# */
save_mp_irq(&mp_irq);
}
@@ -1022,7 +1041,7 @@ void __init mp_config_acpi_legacy_irqs(void)
int i;
int ioapic;
unsigned int dstapic;
- struct mp_config_intsrc mp_irq;
+ struct mpc_intsrc mp_irq;
#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
/*
@@ -1047,7 +1066,7 @@ void __init mp_config_acpi_legacy_irqs(void)
ioapic = mp_find_ioapic(0);
if (ioapic < 0)
return;
- dstapic = mp_ioapics[ioapic].mp_apicid;
+ dstapic = mp_ioapics[ioapic].apicid;
/*
* Use the default configuration for the IRQs 0-15. Unless
@@ -1057,16 +1076,14 @@ void __init mp_config_acpi_legacy_irqs(void)
int idx;
for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mp_config_intsrc *irq = mp_irqs + idx;
+ struct mpc_intsrc *irq = mp_irqs + idx;
/* Do we already have a mapping for this ISA IRQ? */
- if (irq->mp_srcbus == MP_ISA_BUS
- && irq->mp_srcbusirq == i)
+ if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
break;
/* Do we already have a mapping for this IOAPIC pin */
- if (irq->mp_dstapic == dstapic &&
- irq->mp_dstirq == i)
+ if (irq->dstapic == dstapic && irq->dstirq == i)
break;
}
@@ -1075,13 +1092,13 @@ void __init mp_config_acpi_legacy_irqs(void)
continue; /* IRQ already used */
}
- mp_irq.mp_type = MP_INTSRC;
- mp_irq.mp_irqflag = 0; /* Conforming */
- mp_irq.mp_srcbus = MP_ISA_BUS;
- mp_irq.mp_dstapic = dstapic;
- mp_irq.mp_irqtype = mp_INT;
- mp_irq.mp_srcbusirq = i; /* Identity mapped */
- mp_irq.mp_dstirq = i;
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqflag = 0; /* Conforming */
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.dstapic = dstapic;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.srcbusirq = i; /* Identity mapped */
+ mp_irq.dstirq = i;
save_mp_irq(&mp_irq);
}
@@ -1118,7 +1135,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
return gsi;
}
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
#ifdef CONFIG_X86_32
if (ioapic_renumber_irq)
@@ -1192,22 +1209,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
u32 gsi, int triggering, int polarity)
{
#ifdef CONFIG_X86_MPPARSE
- struct mp_config_intsrc mp_irq;
+ struct mpc_intsrc mp_irq;
int ioapic;
if (!acpi_ioapic)
return 0;
/* print the entry should happen on mptable identically */
- mp_irq.mp_type = MP_INTSRC;
- mp_irq.mp_irqtype = mp_INT;
- mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.mp_srcbus = number;
- mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
ioapic = mp_find_ioapic(gsi);
- mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
- mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
save_mp_irq(&mp_irq);
#endif
@@ -1334,7 +1351,7 @@ static void __init acpi_process_madt(void)
if (!error) {
acpi_lapic = 1;
-#ifdef CONFIG_X86_GENERICARCH
+#ifdef CONFIG_X86_BIGSMP
generic_bigsmp_probe();
#endif
/*
@@ -1343,13 +1360,11 @@ static void __init acpi_process_madt(void)
error = acpi_parse_madt_ioapic_entries();
if (!error) {
acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_irq_balance_set(NULL);
acpi_ioapic = 1;
smp_found_config = 1;
-#ifdef CONFIG_X86_32
- setup_apic_routing();
-#endif
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
}
}
if (error == -EINVAL) {
@@ -1360,7 +1375,29 @@ static void __init acpi_process_madt(void)
"Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
+ } else {
+ /*
+ * ACPI found no MADT, and so ACPI wants UP PIC mode.
+ * In the event an MPS table was found, forget it.
+ * Boot with "acpi=off" to use MPS on such a system.
+ */
+ if (smp_found_config) {
+ printk(KERN_WARNING PREFIX
+ "No APIC-table, disabling MPS\n");
+ smp_found_config = 0;
+ }
}
+
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, whereas MPS only supports physical ones.
+ */
+ if (acpi_lapic && acpi_ioapic)
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
#endif
return;
}
@@ -1784,6 +1821,10 @@ static int __init parse_acpi(char *arg)
disable_acpi();
acpi_ht = 1;
}
+ /* "acpi=rsdt" uses the RSDT instead of the XSDT */
+ else if (strcmp(arg, "rsdt") == 0) {
+ acpi_rsdt_forced = 1;
+ }
/* "acpi=noirq" disables ACPI interrupt routing */
else if (strcmp(arg, "noirq") == 0) {
acpi_noirq_set();
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index c2502eb9aa8..bbbe4bbb6f3 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -56,6 +56,7 @@ static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_CSTATE_MASK (0xf)
#define MWAIT_SUBSTATE_SIZE (4)
#define CPUID_MWAIT_LEAF (5)
@@ -66,39 +67,20 @@ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
#define NATIVE_CSTATE_BEYOND_HALT (2)
-int acpi_processor_ffh_cstate_probe(unsigned int cpu,
- struct acpi_processor_cx *cx, struct acpi_power_register *reg)
+static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
{
- struct cstate_entry *percpu_entry;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- cpumask_t saved_mask;
- int retval;
+ struct acpi_processor_cx *cx = _cx;
+ long retval;
unsigned int eax, ebx, ecx, edx;
unsigned int edx_part;
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
- if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF )
- return -1;
-
- if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
- return -1;
-
- percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
- percpu_entry->states[cx->index].eax = 0;
- percpu_entry->states[cx->index].ecx = 0;
-
- /* Make sure we are running on right CPU */
- saved_mask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (retval)
- return -1;
-
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
- cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1;
+ cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
@@ -114,21 +96,45 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
retval = -1;
goto out;
}
- percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
-
- /* Use the hint in CST */
- percpu_entry->states[cx->index].eax = cx->address;
if (!mwait_supported[cstate_type]) {
mwait_supported[cstate_type] = 1;
- printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d "
- "state\n", cx->type);
+ printk(KERN_DEBUG
+ "Monitor-Mwait will be used to enter C-%d "
+ "state\n", cx->type);
}
- snprintf(cx->desc, ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
- cx->address);
-
+ snprintf(cx->desc,
+ ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
+ cx->address);
out:
- set_cpus_allowed_ptr(current, &saved_mask);
+ return retval;
+}
+
+int acpi_processor_ffh_cstate_probe(unsigned int cpu,
+ struct acpi_processor_cx *cx, struct acpi_power_register *reg)
+{
+ struct cstate_entry *percpu_entry;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ long retval;
+
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ return -1;
+
+ if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
+ return -1;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ percpu_entry->states[cx->index].eax = 0;
+ percpu_entry->states[cx->index].ecx = 0;
+
+ /* Make sure we are running on the right CPU */
+
+ retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
+ if (retval == 0) {
+ /* Use the hint in CST */
+ percpu_entry->states[cx->index].eax = cx->address;
+ percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
+ }
return retval;
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 806b4e9051b..7c243a2c511 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
@@ -156,6 +157,8 @@ static int __init acpi_sleep_setup(char *str)
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_no_s4_hw_signature();
+ if (strncmp(str, "s4_nonvs", 8) == 0)
+ acpi_s4_no_nvs();
#endif
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 331b318304e..5113c080f0c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -20,10 +20,15 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
+#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
+#ifdef CONFIG_IOMMU_API
+#include <linux/iommu.h>
+#endif
#include <asm/proto.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@@ -37,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);
+#ifdef CONFIG_IOMMU_API
+static struct iommu_ops amd_iommu_ops;
+#endif
+
/*
* general struct to manage commands send to an IOMMU
*/
@@ -46,6 +55,68 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
+static struct dma_ops_domain *find_protection_domain(u16 devid);
+
+
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+/*
+ * Initialization code for statistics collection
+ */
+
+DECLARE_STATS_COUNTER(compl_wait);
+DECLARE_STATS_COUNTER(cnt_map_single);
+DECLARE_STATS_COUNTER(cnt_unmap_single);
+DECLARE_STATS_COUNTER(cnt_map_sg);
+DECLARE_STATS_COUNTER(cnt_unmap_sg);
+DECLARE_STATS_COUNTER(cnt_alloc_coherent);
+DECLARE_STATS_COUNTER(cnt_free_coherent);
+DECLARE_STATS_COUNTER(cross_page);
+DECLARE_STATS_COUNTER(domain_flush_single);
+DECLARE_STATS_COUNTER(domain_flush_all);
+DECLARE_STATS_COUNTER(alloced_io_mem);
+DECLARE_STATS_COUNTER(total_map_requests);
+
+static struct dentry *stats_dir;
+static struct dentry *de_isolate;
+static struct dentry *de_fflush;
+
+/*
+ * Publishes one statistics counter as a read-only u64 debugfs file named
+ * after the counter, placed below stats_dir.
+ */
+static void amd_iommu_stats_add(struct __iommu_counter *cnt)
+{
+	/* no directory, nowhere to put the file */
+	if (!stats_dir)
+		return;
+
+	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
+				       &cnt->value);
+}
+
+/*
+ * Creates the "amd-iommu" debugfs directory and fills it with the
+ * read-only "isolation" and "fullflush" flags plus one u64 file per
+ * statistics counter. Nothing is exported if the directory cannot be
+ * created.
+ */
+static void amd_iommu_stats_init(void)
+{
+ stats_dir = debugfs_create_dir("amd-iommu", NULL);
+ if (stats_dir == NULL)
+ return;
+
+ /*
+ * NOTE(review): amd_iommu_isolate and amd_iommu_unmap_flush are
+ * declared bool in amd_iommu_init.c but are passed as u32 pointers
+ * below - confirm that the width mismatch is safe.
+ */
+ de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
+ (u32 *)&amd_iommu_isolate);
+
+ de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
+ (u32 *)&amd_iommu_unmap_flush);
+
+ /* one file per counter; each call is a no-op without stats_dir */
+ amd_iommu_stats_add(&compl_wait);
+ amd_iommu_stats_add(&cnt_map_single);
+ amd_iommu_stats_add(&cnt_unmap_single);
+ amd_iommu_stats_add(&cnt_map_sg);
+ amd_iommu_stats_add(&cnt_unmap_sg);
+ amd_iommu_stats_add(&cnt_alloc_coherent);
+ amd_iommu_stats_add(&cnt_free_coherent);
+ amd_iommu_stats_add(&cross_page);
+ amd_iommu_stats_add(&domain_flush_single);
+ amd_iommu_stats_add(&domain_flush_all);
+ amd_iommu_stats_add(&alloced_io_mem);
+ amd_iommu_stats_add(&total_map_requests);
+}
+
+#endif
/* returns !0 if the IOMMU is caching non-present entries in its TLB */
static int iommu_has_npcache(struct amd_iommu *iommu)
@@ -187,12 +258,56 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
spin_lock_irqsave(&iommu->lock, flags);
ret = __iommu_queue_command(iommu, cmd);
+ if (!ret)
+ iommu->need_sync = true;
spin_unlock_irqrestore(&iommu->lock, flags);
return ret;
}
/*
+ * This function waits until an IOMMU has completed a completion
+ * wait command
+ */
+static void __iommu_wait_for_completion(struct amd_iommu *iommu)
+{
+	unsigned long spins = 0;
+	unsigned status = 0;
+
+	INC_STATS_COUNTER(compl_wait);
+
+	/* poll the status register until the completion-wait bit shows up */
+	while (spins < EXIT_LOOP_COUNT) {
+		++spins;
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		if (status & MMIO_STATUS_COM_WAIT_INT_MASK)
+			break;
+	}
+
+	/* write the status back with the completion-wait bit masked out */
+	writel(status & ~MMIO_STATUS_COM_WAIT_INT_MASK,
+	       iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+	/* using up the whole poll budget is treated as fatal */
+	if (unlikely(spins == EXIT_LOOP_COUNT))
+		panic("AMD IOMMU: Completion wait loop failed\n");
+}
+
+/*
+ * Builds a COMPLETION_WAIT command and puts it into the command buffer
+ * of the given IOMMU. Uses the non-locking queue helper and hands its
+ * return value back to the caller.
+ */
+static int __iommu_completion_wait(struct amd_iommu *iommu)
+{
+	struct iommu_cmd wait_cmd;
+
+	memset(&wait_cmd, 0, sizeof(wait_cmd));
+	wait_cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
+	CMD_SET_TYPE(&wait_cmd, CMD_COMPL_WAIT);
+
+	return __iommu_queue_command(iommu, &wait_cmd);
+}
+
+/*
* This function is called whenever we need to ensure that the IOMMU has
* completed execution of all commands we sent. It sends a
* COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
@@ -201,37 +316,23 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
- int ret = 0, ready = 0;
- unsigned status = 0;
- struct iommu_cmd cmd;
- unsigned long flags, i = 0;
+ int ret = 0;
+ unsigned long flags;
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+ spin_lock_irqsave(&iommu->lock, flags);
- iommu->need_sync = 0;
+ if (!iommu->need_sync)
+ goto out;
- spin_lock_irqsave(&iommu->lock, flags);
+ ret = __iommu_completion_wait(iommu);
- ret = __iommu_queue_command(iommu, &cmd);
+ iommu->need_sync = false;
if (ret)
goto out;
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+ __iommu_wait_for_completion(iommu);
- if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
- printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
out:
spin_unlock_irqrestore(&iommu->lock, flags);
@@ -254,11 +355,24 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
ret = iommu_queue_command(iommu, &cmd);
- iommu->need_sync = 1;
-
return ret;
}
+/*
+ * Fills *cmd with an INVALIDATE_IOMMU_PAGES command for the page-aligned
+ * address in domain domid. A non-zero s sets the size bit, a non-zero
+ * pde sets the PDE bit. The command is only built here, not queued.
+ */
+static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+					  u16 domid, int pde, int s)
+{
+	address &= PAGE_MASK;
+
+	memset(cmd, 0, sizeof(*cmd));
+	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+	cmd->data[1] |= domid;
+	cmd->data[2] = lower_32_bits(address);
+	cmd->data[3] = upper_32_bits(address);
+
+	/* size bit - we flush more than one 4kb page */
+	if (s)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+
+	/* PDE bit - we want to flush everything, not only the PTEs */
+	if (pde)
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
/*
* Generic command send function for invalidating TLB entries
*/
@@ -268,21 +382,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
struct iommu_cmd cmd;
int ret;
- memset(&cmd, 0, sizeof(cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
- cmd.data[1] |= domid;
- cmd.data[2] = lower_32_bits(address);
- cmd.data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
- cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
ret = iommu_queue_command(iommu, &cmd);
- iommu->need_sync = 1;
-
return ret;
}
@@ -318,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
{
u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ INC_STATS_COUNTER(domain_flush_single);
+
iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
}
+/*
+ * Flushes the complete IO/TLB of one protection domain (PDE bit set) on
+ * every IOMMU in the system and waits on each IOMMU until the flush has
+ * been processed.
+ */
+static void iommu_flush_domain(u16 domid)
+{
+	struct iommu_cmd inv_cmd;
+	struct amd_iommu *iommu;
+	unsigned long flags;
+
+	INC_STATS_COUNTER(domain_flush_all);
+
+	/* the same invalidation command is sent to every IOMMU */
+	__iommu_build_inv_iommu_pages(&inv_cmd,
+				      CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+				      domid, 1, 1);
+
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		/* queue, sync and wait under a single hold of the lock */
+		spin_lock_irqsave(&iommu->lock, flags);
+		__iommu_queue_command(iommu, &inv_cmd);
+		__iommu_completion_wait(iommu);
+		__iommu_wait_for_completion(iommu);
+		spin_unlock_irqrestore(&iommu->lock, flags);
+	}
+}
+
/****************************************************************************
*
* The functions below are used to create the page table mappings for
@@ -335,15 +464,15 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
* supporting all features of AMD IOMMU page tables like level skipping
* and full 64 bit address spaces.
*/
-static int iommu_map(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot)
+static int iommu_map_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ int prot)
{
u64 __pte, *pte, *page;
bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(phys_addr);
/* only support 512GB address spaces for now */
if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
@@ -385,6 +514,28 @@ static int iommu_map(struct protection_domain *dom,
return 0;
}
+/*
+ * Clears the page table entry for a single page in the 3-level page
+ * table of a protection domain. The walk stops silently when one of the
+ * upper levels is not present. No IO/TLB flush is issued here.
+ */
+static void iommu_unmap_page(struct protection_domain *dom,
+			     unsigned long bus_addr)
+{
+	u64 *pte;
+
+	/* level 2: entry in the root table */
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return;
+
+	/* level 1 */
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return;
+
+	/*
+	 * level 0: the leaf entry has to be selected with the L0 index,
+	 * re-using the L1 index here would clear the wrong PTE
+	 */
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+	*pte = 0;
+}
+
/*
* This function checks if a specific unity mapping entry is needed for
* this specific IOMMU.
@@ -437,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
- ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+ ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
if (ret)
return ret;
/*
@@ -537,7 +688,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
address >>= PAGE_SHIFT;
iommu_area_free(dom->bitmap, address, pages);
- if (address + pages >= dom->next_bit)
+ if (address >= dom->next_bit)
dom->need_flush = true;
}
@@ -568,6 +719,16 @@ static u16 domain_id_alloc(void)
return id;
}
+/*
+ * Marks a protection domain id as unused again in the allocation bitmap.
+ * Ids outside of the open range (0, MAX_DOMAIN_ID) are ignored.
+ */
+static void domain_id_free(int id)
+{
+	bool in_range = (id > 0) && (id < MAX_DOMAIN_ID);
+	unsigned long flags;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	if (in_range)
+		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
/*
* Used to reserve address ranges in the aperture (e.g. for exclusion
* ranges.
@@ -584,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
iommu_area_reserve(dom->bitmap, start_page, pages);
}
-static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+static void free_pagetable(struct protection_domain *domain)
{
int i, j;
u64 *p1, *p2, *p3;
- p1 = dma_dom->domain.pt_root;
+ p1 = domain->pt_root;
if (!p1)
return;
@@ -599,7 +760,7 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
continue;
p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++i) {
+ for (j = 0; j < 512; ++j) {
if (!IOMMU_PTE_PRESENT(p2[j]))
continue;
p3 = IOMMU_PTE_PAGE(p2[j]);
@@ -610,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
}
free_page((unsigned long)p1);
+
+ domain->pt_root = NULL;
}
/*
@@ -621,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
if (!dom)
return;
- dma_ops_free_pagetable(dom);
+ free_pagetable(&dom->domain);
kfree(dom->pte_pages);
@@ -660,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
goto free_dma_dom;
dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ dma_dom->domain.flags = PD_DMA_OPS_MASK;
dma_dom->domain.priv = dma_dom;
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
@@ -722,6 +886,15 @@ free_dma_dom:
}
/*
+ * little helper function to check whether a given protection domain is a
+ * dma_ops domain
+ */
+static bool dma_ops_domain(struct protection_domain *domain)
+{
+	/* true if the PD_DMA_OPS_MASK flag is set on the domain */
+	return (domain->flags & PD_DMA_OPS_MASK) != 0;
+}
+
+/*
* Find out the protection domain structure for a given PCI device. This
* will give us the pointer to the page table root for example.
*/
@@ -741,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid)
* If a device is not yet associated with a domain, this function
* assigns it and makes it visible to the hardware
*/
-static void set_device_domain(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static void attach_device(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
{
unsigned long flags;
-
u64 pte_root = virt_to_phys(domain->pt_root);
+ domain->dev_cnt += 1;
+
pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
@@ -762,10 +936,118 @@ static void set_device_domain(struct amd_iommu *iommu,
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
iommu_queue_inv_dev_entry(iommu, devid);
+}
+
+/*
+ * Removes a device from a protection domain. This variant does not take
+ * amd_iommu_devtable_lock itself; the callers in this file hold it for
+ * writing around the call. Only domain->lock is taken here.
+ */
+static void __detach_device(struct protection_domain *domain, u16 devid)
+{
+
+ /* lock domain */
+ spin_lock(&domain->lock);
+
+ /* remove domain from the lookup table */
+ amd_iommu_pd_table[devid] = NULL;
+
+ /* remove entry from the device table seen by the hardware */
+ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+ amd_iommu_dev_table[devid].data[1] = 0;
+ amd_iommu_dev_table[devid].data[2] = 0;
+
+ /* decrease reference counter */
+ domain->dev_cnt -= 1;
+
+ /* ready */
+ spin_unlock(&domain->lock);
+}
+
+/*
+ * Removes a device from a protection domain. Takes
+ * amd_iommu_devtable_lock for writing itself and then calls
+ * __detach_device(), so the caller must not already hold that lock.
+ */
+static void detach_device(struct protection_domain *domain, u16 devid)
+{
+ unsigned long flags;
+
+ /* lock device table */
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ __detach_device(domain, devid);
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+/*
+ * Bus notifier callback (registered on pci_bus_type). Resolves the alias
+ * of the device id, looks up the responsible IOMMU and reacts to the
+ * event:
+ * BUS_NOTIFY_BOUND_DRIVER: attach the device to the dma_ops domain found
+ * for it, or to the default domain of the IOMMU
+ * BUS_NOTIFY_UNBIND_DRIVER: detach the device from its current domain
+ * BUS_NOTIFY_ADD_DEVICE: allocate a dma_ops domain for the device and
+ * put it on iommu_pd_list
+ * Other events and devices without an IOMMU are ignored. Always
+ * returns 0.
+ */
+static int device_change_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
+ struct protection_domain *domain;
+ struct dma_ops_domain *dma_domain;
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ unsigned long flags;
+
+ /* device ids above amd_iommu_last_bdf are not covered by the tables */
+ if (devid > amd_iommu_last_bdf)
+ goto out;
+
+ devid = amd_iommu_alias_table[devid];
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL)
+ goto out;
+
+ domain = domain_for_device(devid);
+
+ if (domain && !dma_ops_domain(domain))
+ WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
+ "to a non-dma-ops domain\n", dev_name(dev));
+
+ switch (action) {
+ case BUS_NOTIFY_BOUND_DRIVER:
+ /* nothing to do if the device already has a domain */
+ if (domain)
+ goto out;
+ dma_domain = find_protection_domain(devid);
+ if (!dma_domain)
+ dma_domain = iommu->default_dom;
+ attach_device(iommu, &dma_domain->domain, devid);
+ printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+ "device %s\n", dma_domain->domain.id, dev_name(dev));
+ break;
+ case BUS_NOTIFY_UNBIND_DRIVER:
+ if (!domain)
+ goto out;
+ detach_device(domain, devid);
+ break;
+ case BUS_NOTIFY_ADD_DEVICE:
+ /* allocate a protection domain if a device is added */
+ dma_domain = find_protection_domain(devid);
+ if (dma_domain)
+ goto out;
+ dma_domain = dma_ops_domain_alloc(iommu, order);
+ if (!dma_domain)
+ goto out;
+ dma_domain->target_dev = devid;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+ list_add_tail(&dma_domain->list, &iommu_pd_list);
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ break;
+ default:
+ goto out;
+ }
+
+ /* queue a device table entry invalidation and wait for the IOMMU */
+ iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_completion_wait(iommu);
- iommu->need_sync = 1;
+out:
+ return 0;
}
+/*
+ * Notifier block that routes bus events to device_change_notifier().
+ * NOTE(review): not declared static - confirm whether it is referenced
+ * from outside this file.
+ */
+struct notifier_block device_nb = {
+ .notifier_call = device_change_notifier,
+};
+
/*****************************************************************************
*
* The next functions belong to the dma_ops mapping/unmapping code.
@@ -801,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid)
list_for_each_entry(entry, &iommu_pd_list, list) {
if (entry->target_dev == devid) {
ret = entry;
- list_del(&ret->list);
break;
}
}
@@ -852,12 +1133,14 @@ static int get_device_resources(struct device *dev,
if (!dma_dom)
dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
- set_device_domain(*iommu, *domain, *bdf);
+ attach_device(*iommu, *domain, *bdf);
printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device ", (*domain)->id);
- print_devid(_bdf, 1);
+ "device %s\n", (*domain)->id, dev_name(dev));
}
+ if (domain_for_device(_bdf) == NULL)
+ attach_device(*iommu, *domain, _bdf);
+
return 1;
}
@@ -908,7 +1191,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
if (address >= dom->aperture_size)
return;
- WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+ WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
pte += IOMMU_PTE_L0_INDEX(address);
@@ -920,8 +1203,8 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
/*
* This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is uses by all
- * mapping functions provided by this IOMMU driver.
+ * contiguous memory region into DMA address space. It is used by all
+ * mapping functions provided with this IOMMU driver.
* Must be called with the domain lock held.
*/
static dma_addr_t __map_single(struct device *dev,
@@ -942,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev,
pages = iommu_num_pages(paddr, size, PAGE_SIZE);
paddr &= PAGE_MASK;
+ INC_STATS_COUNTER(total_map_requests);
+
+ if (pages > 1)
+ INC_STATS_COUNTER(cross_page);
+
if (align)
align_mask = (1UL << get_order(size)) - 1;
@@ -958,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev,
}
address += offset;
+ ADD_STATS_COUNTER(alloced_io_mem, size);
+
if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
iommu_flush_tlb(iommu, dma_dom->domain.id);
dma_dom->need_flush = false;
@@ -981,7 +1271,8 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_addr_t i, start;
unsigned int pages;
- if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+ if ((dma_addr == bad_dma_address) ||
+ (dma_addr + size > dma_dom->aperture_size))
return;
pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
@@ -993,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu,
start += PAGE_SIZE;
}
+ SUB_STATS_COUNTER(alloced_io_mem, size);
+
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
@@ -1014,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
dma_addr_t addr;
u64 dma_mask;
+ INC_STATS_COUNTER(cnt_map_single);
+
if (!check_device(dev))
return bad_dma_address;
@@ -1025,14 +1320,16 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
/* device not handled by any AMD IOMMU */
return (dma_addr_t)paddr;
+ if (!dma_ops_domain(domain))
+ return bad_dma_address;
+
spin_lock_irqsave(&domain->lock, flags);
addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
dma_mask);
if (addr == bad_dma_address)
goto out;
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1051,17 +1348,21 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
struct protection_domain *domain;
u16 devid;
+ INC_STATS_COUNTER(cnt_unmap_single);
+
if (!check_device(dev) ||
!get_device_resources(dev, &iommu, &domain, &devid))
/* device not handled by any AMD IOMMU */
return;
+ if (!dma_ops_domain(domain))
+ return;
+
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1101,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
int mapped_elems = 0;
u64 dma_mask;
+ INC_STATS_COUNTER(cnt_map_sg);
+
if (!check_device(dev))
return 0;
@@ -1111,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
if (!iommu || !domain)
return map_sg_no_iommu(dev, sglist, nelems, dir);
+ if (!dma_ops_domain(domain))
+ return 0;
+
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
@@ -1127,8 +1433,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto unmap;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1161,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
u16 devid;
int i;
+ INC_STATS_COUNTER(cnt_unmap_sg);
+
if (!check_device(dev) ||
!get_device_resources(dev, &iommu, &domain, &devid))
return;
+ if (!dma_ops_domain(domain))
+ return;
+
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
@@ -1173,8 +1483,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
s->dma_address = s->dma_length = 0;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1193,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size,
phys_addr_t paddr;
u64 dma_mask = dev->coherent_dma_mask;
+ INC_STATS_COUNTER(cnt_alloc_coherent);
+
if (!check_device(dev))
return NULL;
@@ -1211,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
return virt_addr;
}
+ if (!dma_ops_domain(domain))
+ goto out_free;
+
if (!dma_mask)
dma_mask = *dev->dma_mask;
@@ -1219,19 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size,
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address) {
- free_pages((unsigned long)virt_addr, get_order(size));
- virt_addr = NULL;
- goto out;
- }
+ if (*dma_addr == bad_dma_address)
+ goto out_free;
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
-out:
spin_unlock_irqrestore(&domain->lock, flags);
return virt_addr;
+
+out_free:
+
+ free_pages((unsigned long)virt_addr, get_order(size));
+
+ return NULL;
}
/*
@@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
+ INC_STATS_COUNTER(cnt_free_coherent);
+
if (!check_device(dev))
return;
@@ -1253,12 +1570,14 @@ static void free_coherent(struct device *dev, size_t size,
if (!iommu || !domain)
goto free_mem;
+ if (!dma_ops_domain(domain))
+ goto free_mem;
+
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1297,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
* we don't need to preallocate the protection domains anymore.
* For now we have to.
*/
-void prealloc_protection_domains(void)
+static void prealloc_protection_domains(void)
{
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
@@ -1306,7 +1625,7 @@ void prealloc_protection_domains(void)
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- devid = (dev->bus->number << 8) | dev->devfn;
+ devid = calc_devid(dev->bus->number, dev->devfn);
if (devid > amd_iommu_last_bdf)
continue;
devid = amd_iommu_alias_table[devid];
@@ -1353,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
iommu->default_dom = dma_ops_domain_alloc(iommu, order);
if (iommu->default_dom == NULL)
return -ENOMEM;
+ iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
ret = iommu_init_unity_mappings(iommu);
if (ret)
goto free_domains;
@@ -1376,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void)
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
+ register_iommu(&amd_iommu_ops);
+
+ bus_register_notifier(&pci_bus_type, &device_nb);
+
+ amd_iommu_stats_init();
+
return 0;
free_domains:
@@ -1387,3 +1713,224 @@ free_domains:
return ret;
}
+
+/*****************************************************************************
+ *
+ * The following functions belong to the exported interface of AMD IOMMU
+ *
+ * This interface allows access to lower level functions of the IOMMU
+ * like protection domain handling and assignment of devices to domains
+ * which is not possible with the dma_ops interface.
+ *
+ *****************************************************************************/
+
+/*
+ * Detaches every device that is still assigned to the given protection
+ * domain. amd_iommu_devtable_lock is held for writing during the scan of
+ * the lookup table.
+ */
+static void cleanup_domain(struct protection_domain *domain)
+{
+	unsigned long flags;
+	/*
+	 * Wider than u16 on purpose: amd_iommu_last_bdf is a u16, so with a
+	 * value of 0xffff a u16 index would wrap around to 0 and the loop
+	 * condition below would never become false.
+	 */
+	u32 devid;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
+		if (amd_iommu_pd_table[devid] == domain)
+			__detach_device(domain, devid);
+
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+/*
+ * iommu_ops->domain_init: allocates a protection domain with a fresh
+ * domain id and an empty 3-level page table and stores it in dom->priv.
+ *
+ * Returns 0 on success and -ENOMEM if the domain structure, the domain
+ * id or the page table root could not be allocated. Nothing stays
+ * allocated on failure.
+ */
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct protection_domain *domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return -ENOMEM;
+
+	spin_lock_init(&domain->lock);
+	domain->mode = PAGE_MODE_3_LEVEL;
+	domain->id = domain_id_alloc();
+	if (!domain->id)
+		goto out_free;
+	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!domain->pt_root)
+		goto out_free_id;
+
+	dom->priv = domain;
+
+	return 0;
+
+out_free_id:
+	/* hand the domain id back, otherwise it would be lost for good */
+	domain_id_free(domain->id);
+out_free:
+	kfree(domain);
+
+	return -ENOMEM;
+}
+
+/*
+ * iommu_ops->domain_destroy: detaches all devices that are still part of
+ * the domain, then releases its page table, its domain id and the
+ * domain structure itself. A domain without private data is ignored.
+ */
+static void amd_iommu_domain_destroy(struct iommu_domain *dom)
+{
+	struct protection_domain *pd = dom->priv;
+
+	if (pd == NULL)
+		return;
+
+	/* kick out the devices which are still attached */
+	if (pd->dev_cnt > 0)
+		cleanup_domain(pd);
+
+	BUG_ON(pd->dev_cnt != 0);
+
+	free_pagetable(pd);
+	domain_id_free(pd->id);
+	kfree(pd);
+
+	dom->priv = NULL;
+}
+
+/*
+ * iommu_ops->detach_dev: removes a PCI device from the protection domain
+ * behind dom and, if an IOMMU is responsible for the device, invalidates
+ * its device table entry and waits for completion.
+ *
+ * Devices that are not on the PCI bus, or whose device id lies beyond
+ * amd_iommu_last_bdf, are ignored.
+ */
+static void amd_iommu_detach_device(struct iommu_domain *dom,
+				    struct device *dev)
+{
+	struct protection_domain *domain = dom->priv;
+	struct amd_iommu *iommu;
+	struct pci_dev *pdev;
+	u16 devid;
+
+	if (dev->bus != &pci_bus_type)
+		return;
+
+	pdev = to_pci_dev(dev);
+
+	devid = calc_devid(pdev->bus->number, pdev->devfn);
+
+	/* devid indexes the per-device tables below - keep it in range */
+	if (devid > amd_iommu_last_bdf)
+		return;
+
+	if (devid > 0)
+		detach_device(domain, devid);
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return;
+
+	iommu_queue_inv_dev_entry(iommu, devid);
+	iommu_completion_wait(iommu);
+}
+
+/*
+ * iommu_ops->attach_dev: assigns a PCI device to the protection domain
+ * behind dom and waits until the IOMMU has processed the change.
+ *
+ * Returns 0 on success, -EINVAL if the device is not a PCI device, its
+ * device id is out of range or aliased to another id, or no IOMMU is
+ * responsible for it, and -EBUSY if the device already has a domain.
+ */
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+				   struct device *dev)
+{
+	struct protection_domain *domain = dom->priv;
+	struct protection_domain *old_domain;
+	struct amd_iommu *iommu;
+	struct pci_dev *pdev;
+	u16 devid;
+
+	if (dev->bus != &pci_bus_type)
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+
+	devid = calc_devid(pdev->bus->number, pdev->devfn);
+
+	/*
+	 * amd_iommu_last_bdf is the largest valid device id, so only ids
+	 * above it are out of range.
+	 */
+	if (devid > amd_iommu_last_bdf ||
+	    devid != amd_iommu_alias_table[devid])
+		return -EINVAL;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return -EINVAL;
+
+	old_domain = domain_for_device(devid);
+	if (old_domain)
+		return -EBUSY;
+
+	attach_device(iommu, domain, devid);
+
+	iommu_completion_wait(iommu);
+
+	return 0;
+}
+
+/*
+ * iommu_ops->map: maps the physical range starting at paddr to the io
+ * virtual range starting at iova, one page at a time. IOMMU_READ and
+ * IOMMU_WRITE in iommu_prot are translated into IOMMU_PROT_IR and
+ * IOMMU_PROT_IW.
+ *
+ * Returns 0 on success or the first error reported by iommu_map_page();
+ * pages mapped before the error are left in place.
+ */
+static int amd_iommu_map_range(struct iommu_domain *dom,
+			       unsigned long iova, phys_addr_t paddr,
+			       size_t size, int iommu_prot)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long left = iommu_num_pages(paddr, size, PAGE_SIZE);
+	int prot = 0;
+	int ret;
+
+	prot |= (iommu_prot & IOMMU_READ) ? IOMMU_PROT_IR : 0;
+	prot |= (iommu_prot & IOMMU_WRITE) ? IOMMU_PROT_IW : 0;
+
+	iova &= PAGE_MASK;
+	paddr &= PAGE_MASK;
+
+	for (; left > 0; --left) {
+		ret = iommu_map_page(domain, iova, paddr, prot);
+		if (ret)
+			return ret;
+
+		iova += PAGE_SIZE;
+		paddr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * iommu_ops->unmap: clears the page table entries of all pages touched
+ * by [iova, iova + size) and flushes the domain afterwards.
+ */
+static void amd_iommu_unmap_range(struct iommu_domain *dom,
+				  unsigned long iova, size_t size)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long left = iommu_num_pages(iova, size, PAGE_SIZE);
+
+	iova &= PAGE_MASK;
+
+	while (left > 0) {
+		iommu_unmap_page(domain, iova);
+		iova += PAGE_SIZE;
+		--left;
+	}
+
+	/* one flush for the whole range instead of one per page */
+	iommu_flush_domain(domain->id);
+}
+
+/*
+ * iommu_ops->iova_to_phys: walks the 3-level page table of the domain
+ * and returns the physical address an io virtual address is mapped to,
+ * including the offset into the page. Returns 0 if any level of the
+ * walk is not present.
+ */
+static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
+					  unsigned long iova)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long offset = iova & ~PAGE_MASK;
+	phys_addr_t paddr;
+	u64 *table, *pte;
+
+	/* level 2: root table */
+	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return 0;
+
+	/* level 1 */
+	table = IOMMU_PTE_PAGE(*pte);
+	pte = &table[IOMMU_PTE_L1_INDEX(iova)];
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return 0;
+
+	/* level 0: leaf entry */
+	table = IOMMU_PTE_PAGE(*pte);
+	pte = &table[IOMMU_PTE_L0_INDEX(iova)];
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return 0;
+
+	paddr = *pte & IOMMU_PAGE_MASK;
+
+	return paddr | offset;
+}
+
+/*
+ * Callback table handed to register_iommu() in amd_iommu_init_dma_ops();
+ * every entry points to the amd_iommu_* function of this file that
+ * implements it.
+ */
+static struct iommu_ops amd_iommu_ops = {
+ .domain_init = amd_iommu_domain_init,
+ .domain_destroy = amd_iommu_domain_destroy,
+ .attach_dev = amd_iommu_attach_device,
+ .detach_dev = amd_iommu_detach_device,
+ .map = amd_iommu_map_range,
+ .unmap = amd_iommu_unmap_range,
+ .iova_to_phys = amd_iommu_iova_to_phys,
+};
+
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 0cdcda35a05..42c33cebf00 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -28,6 +28,7 @@
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
/*
* definitions for the ACPI scanning code
@@ -121,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
-int amd_iommu_isolate; /* if 1, device isolation is enabled */
+bool amd_iommu_isolate = true; /* if true, device isolation is
+ enabled */
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -242,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
}
/* Function to enable the hardware */
-void __init iommu_enable(struct amd_iommu *iommu)
+static void __init iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
- "at %02x:%02x.%x cap 0x%hx\n",
- iommu->dev->bus->number,
- PCI_SLOT(iommu->dev->devfn),
- PCI_FUNC(iommu->dev->devfn),
- iommu->cap_ptr);
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+ dev_name(&iommu->dev->dev), iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
/* Function to enable IOMMU event logging and event interrupts */
-void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
{
iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
@@ -427,6 +425,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
&entry, sizeof(entry));
+ /* set head and tail to zero manually */
+ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
return cmd_buf;
@@ -1074,7 +1076,8 @@ int __init amd_iommu_init(void)
goto free;
/* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+ amd_iommu_rlookup_table = (void *)__get_free_pages(
+ GFP_KERNEL | __GFP_ZERO,
get_order(rlookup_table_size));
if (amd_iommu_rlookup_table == NULL)
goto free;
@@ -1212,8 +1215,10 @@ static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
if (strncmp(str, "isolate", 7) == 0)
- amd_iommu_isolate = 1;
- if (strncmp(str, "fullflush", 11) == 0)
+ amd_iommu_isolate = true;
+ if (strncmp(str, "share", 5) == 0)
+ amd_iommu_isolate = false;
+ if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9a32b37ee2e..676debfc170 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,8 +1,9 @@
/*
* Firmware replacement code.
*
- * Work around broken BIOSes that don't set an aperture or only set the
- * aperture in the AGP bridge.
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set too small aperture.
+ *
* If all fails map the aperture over some low memory. This is cheaper than
* doing bounce buffering. The memory is lost. This is done at early boot
* because only the bootmem allocator can allocate 32+MB.
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 04a7f960bbc..cf2ca19e62d 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -1,7 +1,7 @@
/*
* Local APIC handling, local APIC timers
*
- * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
*
* Fixes
* Maciej W. Rozycki : Bits for genuine 82489DX APICs;
@@ -14,49 +14,71 @@
* Mikael Pettersson : PM converted to driver model.
*/
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/cpu.h>
-#include <linux/clockchips.h>
+#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/ioport.h>
#include <linux/module.h>
-#include <linux/dmi.h>
+#include <linux/sysdev.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
#include <linux/dmar.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/dmi.h>
+#include <linux/nmi.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/desc.h>
#include <asm/arch_hooks.h>
-#include <asm/hpet.h>
#include <asm/pgalloc.h>
+#include <asm/genapic.h>
+#include <asm/atomic.h>
+#include <asm/mpspec.h>
#include <asm/i8253.h>
-#include <asm/nmi.h>
-#include <asm/idle.h>
+#include <asm/i8259.h>
#include <asm/proto.h>
-#include <asm/timex.h>
#include <asm/apic.h>
-#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/idle.h>
+#include <asm/mtrr.h>
+#include <asm/smp.h>
+
+unsigned int num_processors;
+
+unsigned disabled_cpus __cpuinitdata;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-#include <mach_ipi.h>
+/*
+ * The highest APIC ID seen during enumeration.
+ *
+ * This determines the messaging protocol we can use: if all APIC IDs
+ * are in the 0 ... 7 range, then we can use logical addressing which
+ * has some performance advantages (better broadcasting).
+ *
+ * If there's an APIC ID above 7, we use physical addressing.
+ */
+unsigned int max_physical_apicid;
/*
- * Sanity check
+ * Bitmask of physically existing CPUs:
*/
-#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
-# error SPURIOUS_APIC_VECTOR definition error
-#endif
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
#ifdef CONFIG_X86_32
/*
@@ -97,8 +119,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
#ifdef HAVE_X2APIC
int x2apic;
/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-int disable_x2apic;
+static int x2apic_preenabled;
+static int disable_x2apic;
static __init int setup_nox2apic(char *str)
{
disable_x2apic = 1;
@@ -118,8 +140,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
int first_system_vector = 0xfe;
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
/*
* Debug level, exported for io_apic.c
*/
@@ -141,7 +161,7 @@ static int lapic_next_event(unsigned long delta,
struct clock_event_device *evt);
static void lapic_timer_setup(enum clock_event_mode mode,
struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
+static void lapic_timer_broadcast(const struct cpumask *mask);
static void apic_pm_activate(void);
/*
@@ -227,7 +247,7 @@ void xapic_icr_write(u32 low, u32 id)
apic_write(APIC_ICR, low);
}
-u64 xapic_icr_read(void)
+static u64 xapic_icr_read(void)
{
u32 icr1, icr2;
@@ -267,7 +287,7 @@ void x2apic_icr_write(u32 low, u32 id)
wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
}
-u64 x2apic_icr_read(void)
+static u64 x2apic_icr_read(void)
{
unsigned long val;
@@ -441,6 +461,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write(APIC_LVTT, v);
+ apic_write(APIC_TMICT, 0xffffffff);
break;
case CLOCK_EVT_MODE_RESUME:
/* Nothing to do here */
@@ -453,10 +474,10 @@ static void lapic_timer_setup(enum clock_event_mode mode,
/*
* Local APIC timer broadcast function
*/
-static void lapic_timer_broadcast(cpumask_t mask)
+static void lapic_timer_broadcast(const struct cpumask *mask)
{
#ifdef CONFIG_SMP
- send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+ apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
}
@@ -469,7 +490,7 @@ static void __cpuinit setup_APIC_timer(void)
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
memcpy(levt, &lapic_clockevent, sizeof(*levt));
- levt->cpumask = cpumask_of_cpu(smp_processor_id());
+ levt->cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(levt);
}
@@ -534,7 +555,8 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
}
-static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -545,7 +567,7 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -555,19 +577,30 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
- } else {
- res = (((u64)deltapm) * mult) >> 22;
- do_div(res, 1000000);
- printk(KERN_WARNING "APIC calibration not consistent "
- "with PM Timer: %ldms instead of 100ms\n",
- (long)res);
- /* Correct the lapic counter value */
- res = (((u64)(*delta)) * pm_100ms);
+ apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ return 0;
+ }
+
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ pr_warning("APIC calibration not consistent "
+ "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ pr_info("APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+
+ /* Correct the tsc counter value */
+ if (cpu_has_tsc) {
+ res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
- "%lu (%ld)\n", (unsigned long)res, *delta);
- *delta = (long)res;
+ apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
+ "PM-Timer: %lu (%ld) \n",
+ (unsigned long)res, *deltatsc);
+ *deltatsc = (long)res;
}
return 0;
@@ -578,7 +611,7 @@ static int __init calibrate_APIC_clock(void)
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
void (*real_handler)(struct clock_event_device *dev);
unsigned long deltaj;
- long delta;
+ long delta, deltatsc;
int pm_referenced = 0;
local_irq_disable();
@@ -608,9 +641,11 @@ static int __init calibrate_APIC_clock(void)
delta = lapic_cal_t1 - lapic_cal_t2;
apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+
/* we trust the PM based calibration if possible */
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
- &delta);
+ &delta, &deltatsc);
/* Calculate the scaled math multiplication factor */
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -628,11 +663,10 @@ static int __init calibrate_APIC_clock(void)
calibration_result);
if (cpu_has_tsc) {
- delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
"%ld.%04ld MHz.\n",
- (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
}
apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
@@ -645,8 +679,7 @@ static int __init calibrate_APIC_clock(void)
*/
if (calibration_result < (1000000 / HZ)) {
local_irq_enable();
- printk(KERN_WARNING
- "APIC frequency too slow, disabling apic timer\n");
+ pr_warning("APIC frequency too slow, disabling apic timer\n");
return -1;
}
@@ -672,13 +705,9 @@ static int __init calibrate_APIC_clock(void)
while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
cpu_relax();
- local_irq_disable();
-
/* Stop the lapic timer */
lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
- local_irq_enable();
-
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
@@ -692,8 +721,7 @@ static int __init calibrate_APIC_clock(void)
local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- printk(KERN_WARNING
- "APIC timer disabled due to verification failure.\n");
+ pr_warning("APIC timer disabled due to verification failure\n");
return -1;
}
@@ -714,7 +742,7 @@ void __init setup_boot_APIC_clock(void)
* broadcast mechanism is used. On UP systems simply ignore it.
*/
if (disable_apic_timer) {
- printk(KERN_INFO "Disabling APIC timer\n");
+ pr_info("Disabling APIC timer\n");
/* No broadcast on UP ! */
if (num_possible_cpus() > 1) {
lapic_clockevent.mult = 1;
@@ -741,7 +769,7 @@ void __init setup_boot_APIC_clock(void)
if (nmi_watchdog != NMI_IO_APIC)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
else
- printk(KERN_WARNING "APIC timer registered as dummy,"
+ pr_warning("APIC timer registered as dummy,"
" due to nmi_watchdog=%d!\n", nmi_watchdog);
/* Setup the lapic or request the broadcast */
@@ -773,8 +801,7 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- printk(KERN_WARNING
- "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
/* Switch it off */
lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
return;
@@ -783,11 +810,7 @@ static void local_apic_timer_interrupt(void)
/*
* the NMI deadlock-detector uses this.
*/
-#ifdef CONFIG_X86_64
- add_pda(apic_timer_irqs, 1);
-#else
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
+ inc_irq_stat(apic_timer_irqs);
evt->event_handler(evt);
}
@@ -800,7 +823,7 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -814,9 +837,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
local_apic_timer_interrupt();
irq_exit();
@@ -907,6 +928,10 @@ void disable_local_APIC(void)
{
unsigned int value;
+ /* APIC hasn't been mapped yet */
+ if (!apic_phys)
+ return;
+
clear_local_APIC();
/*
@@ -999,11 +1024,11 @@ int __init verify_local_APIC(void)
*/
reg0 = apic_read(APIC_ID);
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+ apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
reg1 = apic_read(APIC_ID);
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ APIC_ID_MASK))
+ if (reg1 != (reg0 ^ apic->apic_id_mask))
return 0;
/*
@@ -1093,18 +1118,18 @@ static void __cpuinit lapic_setup_esr(void)
unsigned int oldvalue, value, maxlvt;
if (!lapic_is_integrated()) {
- printk(KERN_INFO "No ESR for 82489DX.\n");
+ pr_info("No ESR for 82489DX.\n");
return;
}
- if (esr_disable) {
+ if (apic->disable_esr) {
/*
* Something untraceable is creating bad interrupts on
* secondary quads ... for the moment, just leave the
* ESR disabled - we can't do anything useful with the
* errors anyway - mbligh
*/
- printk(KERN_INFO "Leaving ESR disabled.\n");
+ pr_info("Leaving ESR disabled.\n");
return;
}
@@ -1138,9 +1163,14 @@ void __cpuinit setup_local_APIC(void)
unsigned int value;
int i, j;
+ if (disable_apic) {
+ arch_disable_smp_support();
+ return;
+ }
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (lapic_is_integrated() && esr_disable) {
+ if (lapic_is_integrated() && apic->disable_esr) {
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
@@ -1154,7 +1184,7 @@ void __cpuinit setup_local_APIC(void)
* Double-check whether this APIC is really registered.
* This is meaningless in clustered apic mode, so we skip it.
*/
- if (!apic_id_registered())
+ if (!apic->apic_id_registered())
BUG();
/*
@@ -1162,7 +1192,7 @@ void __cpuinit setup_local_APIC(void)
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
- init_apic_ldr();
+ apic->init_apic_ldr();
/*
* Set Task Priority to 'accept all'. We never change this
@@ -1298,7 +1328,7 @@ void check_x2apic(void)
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (msr & X2APIC_ENABLE) {
- printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+ pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
x2apic_preenabled = x2apic = 1;
apic_ops = &x2apic_ops;
}
@@ -1310,12 +1340,12 @@ void enable_x2apic(void)
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
- printk("Enabling x2apic\n");
+ pr_info("Enabling x2apic\n");
wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
}
}
-void enable_IR_x2apic(void)
+void __init enable_IR_x2apic(void)
{
#ifdef CONFIG_INTR_REMAP
int ret;
@@ -1325,9 +1355,8 @@ void enable_IR_x2apic(void)
return;
if (!x2apic_preenabled && disable_x2apic) {
- printk(KERN_INFO
- "Skipped enabling x2apic and Interrupt-remapping "
- "because of nox2apic\n");
+ pr_info("Skipped enabling x2apic and Interrupt-remapping "
+ "because of nox2apic\n");
return;
}
@@ -1335,22 +1364,19 @@ void enable_IR_x2apic(void)
panic("Bios already enabled x2apic, can't enforce nox2apic");
if (!x2apic_preenabled && skip_ioapic_setup) {
- printk(KERN_INFO
- "Skipped enabling x2apic and Interrupt-remapping "
- "because of skipping io-apic setup\n");
+ pr_info("Skipped enabling x2apic and Interrupt-remapping "
+ "because of skipping io-apic setup\n");
return;
}
ret = dmar_table_init();
if (ret) {
- printk(KERN_INFO
- "dmar_table_init() failed with %d:\n", ret);
+ pr_info("dmar_table_init() failed with %d:\n", ret);
if (x2apic_preenabled)
panic("x2apic enabled by bios. But IR enabling failed");
else
- printk(KERN_INFO
- "Not enabling x2apic,Intr-remapping\n");
+ pr_info("Not enabling x2apic,Intr-remapping\n");
return;
}
@@ -1359,7 +1385,7 @@ void enable_IR_x2apic(void)
ret = save_mask_IO_APIC_setup();
if (ret) {
- printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
goto end;
}
@@ -1394,14 +1420,11 @@ end:
if (!ret) {
if (!x2apic_preenabled)
- printk(KERN_INFO
- "Enabled x2apic and interrupt-remapping\n");
+ pr_info("Enabled x2apic and interrupt-remapping\n");
else
- printk(KERN_INFO
- "Enabled Interrupt-remapping\n");
+ pr_info("Enabled Interrupt-remapping\n");
} else
- printk(KERN_ERR
- "Failed to enable Interrupt-remapping and x2apic\n");
+ pr_err("Failed to enable Interrupt-remapping and x2apic\n");
#else
if (!cpu_has_x2apic)
return;
@@ -1410,8 +1433,8 @@ end:
panic("x2apic enabled prior OS handover,"
" enable CONFIG_INTR_REMAP");
- printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- " and x2apic\n");
+ pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+ " and x2apic\n");
#endif
return;
@@ -1428,7 +1451,7 @@ end:
static int __init detect_init_APIC(void)
{
if (!cpu_has_apic) {
- printk(KERN_INFO "No local APIC present\n");
+ pr_info("No local APIC present\n");
return -1;
}
@@ -1451,7 +1474,7 @@ static int __init detect_init_APIC(void)
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
- (boot_cpu_data.x86 == 15))
+ (boot_cpu_data.x86 >= 15))
break;
goto no_apic;
case X86_VENDOR_INTEL:
@@ -1469,8 +1492,8 @@ static int __init detect_init_APIC(void)
* "lapic" specified.
*/
if (!force_enable_local_apic) {
- printk(KERN_INFO "Local APIC disabled by BIOS -- "
- "you can enable it with \"lapic\"\n");
+ pr_info("Local APIC disabled by BIOS -- "
+ "you can enable it with \"lapic\"\n");
return -1;
}
/*
@@ -1480,8 +1503,7 @@ static int __init detect_init_APIC(void)
*/
rdmsr(MSR_IA32_APICBASE, l, h);
if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- printk(KERN_INFO
- "Local APIC disabled by BIOS -- reenabling.\n");
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
l &= ~MSR_IA32_APICBASE_BASE;
l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
wrmsr(MSR_IA32_APICBASE, l, h);
@@ -1494,7 +1516,7 @@ static int __init detect_init_APIC(void)
*/
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
- printk(KERN_WARNING "Could not enable APIC!\n");
+ pr_warning("Could not enable APIC!\n");
return -1;
}
set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@ -1505,14 +1527,14 @@ static int __init detect_init_APIC(void)
if (l & MSR_IA32_APICBASE_ENABLE)
mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
- printk(KERN_INFO "Found and enabled local APIC!\n");
+ pr_info("Found and enabled local APIC!\n");
apic_pm_activate();
return 0;
no_apic:
- printk(KERN_INFO "No local APIC present or hardware disabled\n");
+ pr_info("No local APIC present or hardware disabled\n");
return -1;
}
#endif
@@ -1586,14 +1608,14 @@ int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
-#ifdef CONFIG_X86_64
if (disable_apic) {
- printk(KERN_INFO "Apic disabled\n");
+ pr_info("Apic disabled\n");
return -1;
}
+#ifdef CONFIG_X86_64
if (!cpu_has_apic) {
disable_apic = 1;
- printk(KERN_INFO "Apic disabled by BIOS\n");
+ pr_info("Apic disabled by BIOS\n");
return -1;
}
#else
@@ -1605,8 +1627,8 @@ int __init APIC_init_uniprocessor(void)
*/
if (!cpu_has_apic &&
APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
- boot_cpu_physical_apicid);
+ pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+ boot_cpu_physical_apicid);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
return -1;
}
@@ -1616,7 +1638,7 @@ int __init APIC_init_uniprocessor(void)
enable_IR_x2apic();
#endif
#ifdef CONFIG_X86_64
- setup_apic_routing();
+ default_setup_apic_routing();
#endif
verify_local_APIC();
@@ -1682,9 +1704,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
{
u32 v;
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
/*
* Check if this really is a spurious interrupt and ACK it
@@ -1695,14 +1715,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();
-#ifdef CONFIG_X86_64
- add_pda(irq_spurious_count, 1);
-#else
+ inc_irq_stat(irq_spurious_count);
+
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
- printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
- "should never happen.\n", smp_processor_id());
- __get_cpu_var(irq_stat).irq_spurious_count++;
-#endif
+ pr_info("spurious APIC interrupt on CPU#%d, "
+ "should never happen.\n", smp_processor_id());
irq_exit();
}
@@ -1713,9 +1730,7 @@ void smp_error_interrupt(struct pt_regs *regs)
{
u32 v, v1;
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
@@ -1724,17 +1739,18 @@ void smp_error_interrupt(struct pt_regs *regs)
ack_APIC_irq();
atomic_inc(&irq_err_count);
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+ /*
+ * Here is what the APIC error bits mean:
+ * 0: Send CS error
+ * 1: Receive CS error
+ * 2: Send accept error
+ * 3: Receive accept error
+ * 4: Reserved
+ * 5: Send illegal vector
+ * 6: Received illegal vector
+ * 7: Illegal register address
+ */
+ pr_debug("APIC error on CPU%d: %02x(%02x)\n",
smp_processor_id(), v , v1);
irq_exit();
}
@@ -1760,7 +1776,8 @@ void __init connect_bsp_APIC(void)
outb(0x01, 0x23);
}
#endif
- enable_apic_mode();
+ if (apic->enable_apic_mode)
+ apic->enable_apic_mode();
}
/**
@@ -1832,28 +1849,37 @@ void disconnect_bsp_APIC(int virt_wire_setup)
void __cpuinit generic_processor_info(int apicid, int version)
{
int cpu;
- cpumask_t tmp_map;
/*
* Validate version
*/
if (version == 0x0) {
- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
+ pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
+ "fixing up to 0x10. (tell your hw vendor)\n",
version);
version = 0x10;
}
apic_version[apicid] = version;
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
+ if (num_processors >= nr_cpu_ids) {
+ int max = nr_cpu_ids;
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning(
+ "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
return;
}
num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
+ cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+ if (version != apic_version[boot_cpu_physical_apicid])
+ WARN_ONCE(1,
+ "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
+ apic_version[boot_cpu_physical_apicid], cpu, version);
physid_set(apicid, phys_cpu_present_map);
if (apicid == boot_cpu_physical_apicid) {
@@ -1889,29 +1915,39 @@ void __cpuinit generic_processor_info(int apicid, int version)
}
#endif
-#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
#endif
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
+ set_cpu_possible(cpu, true);
+ set_cpu_present(cpu, true);
}
-#ifdef CONFIG_X86_64
int hard_smp_processor_id(void)
{
return read_apic_id();
}
+
+void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
+
+#ifdef CONFIG_X86_32
+int default_apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
#endif
/*
@@ -2106,18 +2142,16 @@ __cpuinit int apic_is_clustered_box(void)
bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
/* are we being called early in kernel startup? */
if (bios_cpu_apicid) {
id = bios_cpu_apicid[i];
- }
- else if (i < nr_cpu_ids) {
+ } else if (i < nr_cpu_ids) {
if (cpu_present(i))
id = per_cpu(x86_bios_cpu_apicid, i);
else
continue;
- }
- else
+ } else
break;
if (id != BAD_APICID)
@@ -2209,7 +2243,7 @@ static int __init apic_set_verbosity(char *arg)
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ pr_warning("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bb..37ba5f85b71 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -160,9 +160,9 @@
* Work around byte swap bug in one of the Vaio's BIOS's
* (Marc Boucher <marc@mbsi.ca>).
* Exposed the disable flag to dmi so that we can handle known
- * broken APM (Alan Cox <alan@redhat.com>).
+ * broken APM (Alan Cox <alan@lxorguk.ukuu.org.uk>).
* 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
- * calling it - instead idle. (Alan Cox <alan@redhat.com>)
+ * calling it - instead idle. (Alan Cox <alan@lxorguk.ukuu.org.uk>)
* If an APM idle fails log it and idle sensibly
* 1.15: Don't queue events to clients who open the device O_WRONLY.
* Don't expect replies from clients who open the device O_RDONLY.
@@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int);
*/
#define APM_ZERO_SEGS
-#include "apm.h"
+#include <asm/apm.h>
/*
* Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
@@ -391,11 +391,7 @@ static int power_off;
#else
static int power_off = 1;
#endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int realmode_power_off = 1;
-#else
static int realmode_power_off;
-#endif
#ifdef CONFIG_APM_ALLOW_INTS
static int allow_ints = 1;
#else
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6649d09ad88..fbf2f33e308 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -11,7 +11,7 @@
#include <linux/suspend.h>
#include <linux/kbuild.h>
#include <asm/ucontext.h>
-#include "sigframe.h"
+#include <asm/sigframe.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
@@ -75,6 +75,7 @@ void foo(void)
OFFSET(PT_DS, pt_regs, ds);
OFFSET(PT_ES, pt_regs, es);
OFFSET(PT_FS, pt_regs, fs);
+ OFFSET(PT_GS, pt_regs, gs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
OFFSET(PT_EIP, pt_regs, ip);
OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d22f8..8793ab33e2c 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
#include <linux/hardirq.h>
#include <linux/suspend.h>
#include <linux/kbuild.h>
-#include <asm/pda.h>
#include <asm/processor.h>
#include <asm/segment.h>
#include <asm/thread_info.h>
@@ -20,6 +19,8 @@
#include <xen/interface/xen.h>
+#include <asm/sigframe.h>
+
#define __NO_STUBS 1
#undef __SYSCALL
#undef _ASM_X86_UNISTD_64_H
@@ -46,16 +47,6 @@ int main(void)
#endif
BLANK();
#undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
- ENTRY(kernelstack);
- ENTRY(oldrsp);
- ENTRY(pcurrent);
- ENTRY(irqcount);
- ENTRY(cpunumber);
- ENTRY(irqstackptr);
- ENTRY(data_offset);
- BLANK();
-#undef ENTRY
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
@@ -87,7 +78,7 @@ int main(void)
BLANK();
#undef ENTRY
DEFINE(IA32_RT_SIGFRAME_sigcontext,
- offsetof (struct rt_sigframe32, uc.uc_mcontext));
+ offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
BLANK();
#endif
DEFINE(pbe_address, offsetof(struct pbe, address));
diff --git a/arch/x86/kernel/bigsmp_32.c b/arch/x86/kernel/bigsmp_32.c
new file mode 100644
index 00000000000..47a62f46afd
--- /dev/null
+++ b/arch/x86/kernel/bigsmp_32.c
@@ -0,0 +1,266 @@
+/*
+ * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
+ * Drives the local APIC in physical destination mode ("Physflat"), not "clustered mode".
+ */
+#define APIC_DEFINITION 1
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/ipi.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+
+
+static inline unsigned bigsmp_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
+
+static inline int bigsmp_apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline const cpumask_t *bigsmp_target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return &cpu_online_map;
+#else
+ return &cpumask_of_cpu(0);
+#endif
+}
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+
+static inline unsigned long
+bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+
+static inline unsigned long bigsmp_check_apicid_present(int bit)
+{
+ return 1;
+}
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long val, id;
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ id = xapic_phys_to_log_apicid(cpu);
+ val |= SET_APIC_LOGICAL_ID(id);
+ return val;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends setting DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void bigsmp_init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static inline void bigsmp_setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "Physflat", nr_ioapics);
+}
+
+static inline int bigsmp_apicid_to_node(int logical_apicid)
+{
+ return apicid_2_node[hard_smp_processor_id()];
+}
+
+static inline int bigsmp_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+
+ return BAD_APICID;
+}
+
+static inline physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+extern u8 cpu_2_logical_apicid[];
+/* Mapping from cpu number to logical apicid */
+static inline int bigsmp_cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return cpu_physical_id(cpu);
+}
+
+static inline physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xFFL);
+}
+
+static inline void bigsmp_setup_portio_remap(void)
+{
+}
+
+static inline int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+/* As we are using a single CPU as destination, pick only one CPU here */
+static inline unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask));
+}
+
+static inline unsigned int
+bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ if (cpu < nr_cpu_ids)
+ return bigsmp_cpu_to_logical_apicid(cpu);
+
+ return BAD_APICID;
+}
+
+static inline int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(mask, vector);
+}
+
+static inline void bigsmp_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static inline void bigsmp_send_IPI_all(int vector)
+{
+ bigsmp_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int dmi_bigsmp; /* can be set by dmi scanners */
+
+static int hp_ht_bigsmp(const struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
+ dmi_bigsmp = 1;
+ return 0;
+}
+
+
+static const struct dmi_system_id bigsmp_dmi_table[] = {
+ { hp_ht_bigsmp, "HP ProLiant DL760 G2",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P44-"),}
+ },
+
+ { hp_ht_bigsmp, "HP ProLiant DL740",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P47-"),}
+ },
+ { }
+};
+
+static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ cpus_clear(*retmask);
+ cpu_set(cpu, *retmask);
+}
+
+static int probe_bigsmp(void)
+{
+ if (def_to_bigsmp)
+ dmi_bigsmp = 1;
+ else
+ dmi_check_system(bigsmp_dmi_table);
+ return dmi_bigsmp;
+}
+
+struct genapic apic_bigsmp = {
+
+ .name = "bigsmp",
+ .probe = probe_bigsmp,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = bigsmp_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPU: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = bigsmp_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = bigsmp_check_apicid_used,
+ .check_apicid_present = bigsmp_check_apicid_present,
+
+ .vector_allocation_domain = bigsmp_vector_allocation_domain,
+ .init_apic_ldr = bigsmp_init_apic_ldr,
+
+ .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
+ .setup_apic_routing = bigsmp_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = bigsmp_apicid_to_node,
+ .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
+ .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = bigsmp_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = bigsmp_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = bigsmp_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
+ .send_IPI_all = bigsmp_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+};
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f0dfe6f17e7..f63882728d9 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,7 +25,7 @@
#include <asm/uv/bios.h>
#include <asm/uv/uv_hub.h>
-struct uv_systab uv_systab;
+static struct uv_systab uv_systab;
s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
@@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
long sn_partition_id;
EXPORT_SYMBOL_GPL(sn_partition_id);
-long uv_coherency_id;
-EXPORT_SYMBOL_GPL(uv_coherency_id);
-long uv_region_size;
-EXPORT_SYMBOL_GPL(uv_region_size);
+long sn_coherency_id;
+EXPORT_SYMBOL_GPL(sn_coherency_id);
+long sn_region_size;
+EXPORT_SYMBOL_GPL(sn_region_size);
int uv_type;
@@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
return ret;
}
+/*
+ * Ask the BIOS to allocate a watchlist for a message queue.
+ *
+ * @blade:           blade number, packed together with @mq_size into a
+ *                   single BIOS argument
+ * @addr:            address argument passed to UV_BIOS_WATCHLIST_ALLOC
+ * @mq_size:         message queue size
+ * @intr_mmr_offset: pointer handed to the BIOS call unchanged
+ *
+ * Returns the watchlist number written back by the BIOS, or the BIOS
+ * status when that status is below BIOS_STATUS_SUCCESS.
+ */
+int
+uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
+			   unsigned long *intr_mmr_offset)
+{
+	union uv_watchlist_u packed;
+	u64 wl_num;
+	s64 status;
+
+	packed.size = mq_size;
+	packed.blade = blade;
+
+	/*
+	 * The BIOS reports either a watchlist number or a negative
+	 * error number; the status is narrowed to int before the test.
+	 */
+	status = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
+					   packed.val, (u64)intr_mmr_offset,
+					   (u64)&wl_num, 0);
+	if (status >= BIOS_STATUS_SUCCESS)
+		return wl_num;
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
+
+/*
+ * Release watchlist @watchlist_num on @blade through the
+ * UV_BIOS_WATCHLIST_FREE BIOS call. The BIOS status is returned
+ * narrowed to int.
+ */
+int
+uv_bios_mq_watchlist_free(int blade, int watchlist_num)
+{
+	s64 status;
+
+	status = uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
+				      blade, watchlist_num, 0, 0, 0);
+	return (int)status;
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
+
+/*
+ * Issue the UV_BIOS_MEMPROTECT BIOS call for the range described by
+ * @paddr and @len with protection @perms, and hand back the BIOS status.
+ */
+s64
+uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
+{
+	s64 status;
+
+	status = uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
+				      perms, 0, 0);
+	return status;
+}
+EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
+
+/*
+ * Issue the UV_BIOS_GET_PARTITION_ADDR BIOS call and return its status.
+ *
+ * Note that the BIOS argument order (cookie, addr, buf, len) differs
+ * from this function's parameter order.
+ */
+s64
+uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
+{
+	return uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
+				    (u64)addr, buf, (u64)len, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
{
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 00000000000..2ac0ab71412
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,161 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+
+/*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable. Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+/* Upper bound on the number of entries kept in scan_areas[] */
+#define MAX_SCAN_AREAS 8
+
+/*
+ * -1 until overridden by the "memory_corruption_check" boot parameter;
+ * setup_bios_corruption_check() turns a leftover -1 into the
+ * CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK default.
+ */
+static int __read_mostly memory_corruption_check = -1;
+
+/* Bytes of low memory to cover; settable via "memory_corruption_check_size" */
+static unsigned __read_mostly corruption_check_size = 64*1024;
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+/* Regions reserved for scanning, filled in by setup_bios_corruption_check() */
+static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas;
+
+
+/*
+ * "memory_corruption_check" early parameter: parse a decimal integer into
+ * memory_corruption_check. Returns -EINVAL when characters follow the
+ * number (the parsed value has already been stored by then), 0 otherwise.
+ */
+static __init int set_corruption_check(char *arg)
+{
+	char *tail;
+
+	memory_corruption_check = simple_strtol(arg, &tail, 10);
+	if (*tail != 0)
+		return -EINVAL;
+
+	return 0;
+}
+early_param("memory_corruption_check", set_corruption_check);
+
+/*
+ * "memory_corruption_check_period" early parameter: parse an unsigned
+ * decimal number of seconds into corruption_check_period. Returns -EINVAL
+ * when characters follow the number (the value is stored regardless),
+ * 0 otherwise.
+ */
+static __init int set_corruption_check_period(char *arg)
+{
+	char *tail;
+
+	corruption_check_period = simple_strtoul(arg, &tail, 10);
+	if (*tail != 0)
+		return -EINVAL;
+
+	return 0;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+
+/*
+ * "memory_corruption_check_size" early parameter: parse a size (memparse()
+ * accepts K/M/G style suffixes) into corruption_check_size.
+ *
+ * Returns 0 when the whole argument was consumed and the value fits in
+ * corruption_check_size; otherwise returns -EINVAL and leaves the current
+ * size untouched.
+ */
+static __init int set_corruption_check_size(char *arg)
+{
+	char *end;
+	unsigned long long size;
+
+	size = memparse(arg, &end);
+
+	/*
+	 * Decide validity from the parse itself rather than by comparing
+	 * against the stored value: reject trailing junk and anything that
+	 * would be truncated when stored in an unsigned int.
+	 */
+	if (*end != '\0' || size != (unsigned)size)
+		return -EINVAL;
+
+	corruption_check_size = size;
+	return 0;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+
+/*
+ * Decide whether low-memory corruption checking is enabled and, if so,
+ * reserve and zero the free e820 areas below corruption_check_size so
+ * that check_for_bios_corruption() can scan them later. At most
+ * MAX_SCAN_AREAS areas are recorded in scan_areas[].
+ */
+void __init setup_bios_corruption_check(void)
+{
+ u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+
+ /* No boot parameter changed the setting: use the Kconfig default */
+ if (memory_corruption_check == -1) {
+ memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ 1
+#else
+ 0
+#endif
+ ;
+ }
+
+ /* A zero-sized window disables the check */
+ if (corruption_check_size == 0)
+ memory_corruption_check = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+
+ while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
+ u64 size;
+ addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+
+ if (addr == 0)
+ break;
+
+ /* Clip the area so it ends at the checked window */
+ if ((addr + size) > corruption_check_size)
+ size = corruption_check_size - addr;
+
+ if (size == 0)
+ break;
+
+ /* Retype the range from E820_RAM to E820_RESERVED and record it */
+ e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+ scan_areas[num_scan_areas].addr = addr;
+ scan_areas[num_scan_areas].size = size;
+ num_scan_areas++;
+
+ /* Assume we've already mapped this early memory */
+ memset(__va(addr), 0, size);
+
+ addr += size;
+ }
+
+ printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
+ num_scan_areas);
+ update_e820();
+}
+
+
+/*
+ * Walk every recorded scan area one unsigned long at a time. Any non-zero
+ * word is reported, then reset to zero so it is not reported again. A
+ * one-time warning is raised if anything was found. Does nothing when
+ * memory_corruption_check is off.
+ */
+void check_for_bios_corruption(void)
+{
+	int area;
+	int corruption = 0;
+
+	if (!memory_corruption_check)
+		return;
+
+	for (area = 0; area < num_scan_areas; area++) {
+		unsigned long *word = __va(scan_areas[area].addr);
+		unsigned long remaining = scan_areas[area].size;
+
+		while (remaining) {
+			if (*word) {
+				printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+				       word, __pa(word), *word);
+				corruption = 1;
+				*word = 0;
+			}
+
+			word++;
+			remaining -= sizeof(unsigned long);
+		}
+	}
+
+	WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+
+static void check_corruption(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
+
+/*
+ * Delayed-work handler: run one scan, then re-queue bios_check_work
+ * corruption_check_period seconds later, with the delay passed through
+ * round_jiffies_relative().
+ */
+static void check_corruption(struct work_struct *dummy)
+{
+	unsigned long delay;
+
+	check_for_bios_corruption();
+
+	delay = round_jiffies_relative(corruption_check_period*HZ);
+	schedule_delayed_work(&bios_check_work, delay);
+}
+
+/*
+ * Initcall: start the periodic low-memory scan. Nothing is scheduled when
+ * the check is disabled or the period is 0. Always returns 0.
+ */
+static int start_periodic_check_for_corruption(void)
+{
+	if (!memory_corruption_check || corruption_check_period == 0)
+		return 0;
+
+	/* corruption_check_period is unsigned, so it is printed with %u */
+	printk(KERN_INFO "Scanning for low memory corruption every %u seconds\n",
+	       corruption_check_period);
+
+	/* First time we run the checks right away */
+	schedule_delayed_work(&bios_check_work, 0);
+	return 0;
+}
+
+module_init(start_periodic_check_for_corruption);
+
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c05..82db7f45e2d 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,8 +2,14 @@
# Makefile for x86-compatible CPU details and quirks
#
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+endif
+
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
+obj-y += vmware.o hypervisor.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ef8f831af82..e48640cfac0 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,7 +7,7 @@
#include <asm/pat.h>
#include <asm/processor.h>
-#include <mach_apic.h>
+#include <asm/genapic.h>
struct cpuid_bit {
u16 feature;
@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
*/
void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
@@ -116,14 +116,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
-#ifdef CONFIG_X86_32
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
& core_select_mask;
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
-#else
- c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
- c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
-#endif
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+ /*
+ * Reinit the apicid, now that we have extended initial_apicid.
+ */
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
@@ -135,37 +135,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
return;
#endif
}
-
-#ifdef CONFIG_X86_PAT
-void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
-{
- if (!cpu_has_pat)
- pat_disable("PAT not supported by CPU.");
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- /*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
- *
- * Enable PAT WC only on P4, Core 2 or later CPUs.
- */
- if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
- return;
-
- pat_disable("PAT WC disabled due to known CPU erratum.");
- return;
-
- case X86_VENDOR_AMD:
- case X86_VENDOR_CENTAUR:
- case X86_VENDOR_TRANSMETA:
- return;
- }
-
- pat_disable("PAT disabled. Not yet verified on this CPU type.");
-}
-#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8f1e31db2ad..ff4d7b9e32e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,7 @@
# include <asm/cacheflush.h>
#endif
-#include <mach_apic.h>
+#include <asm/genapic.h>
#include "cpu.h"
@@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
{
early_init_amd_mc(c);
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a..e8f4a386bd9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,14 +21,16 @@
#include <asm/asm.h>
#include <asm/numa.h>
#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/cpumask.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
-#include <mach_apic.h>
#include <asm/genapic.h>
+#include <asm/genapic.h>
+#include <asm/uv/uv.h>
#endif
-#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
@@ -36,28 +38,59 @@
#include <asm/proto.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/hypervisor.h>
+#include <asm/stackprotector.h>
#include "cpu.h"
+#ifdef CONFIG_X86_64
+
+/* all of these masks are initialized in setup_cpu_local_masks() */
+cpumask_var_t cpu_callin_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_initialized_mask;
+
+/* representing cpus for which sibling maps can be computed */
+cpumask_var_t cpu_sibling_setup_mask;
+
+/*
+ * correctly size the local cpu masks: calls alloc_bootmem_cpumask_var()
+ * on each of the four cpumask_var_t masks declared above.
+ */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+ alloc_bootmem_cpumask_var(&cpu_callin_mask);
+ alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
+#else /* CONFIG_X86_32 */
+
+cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+cpumask_t cpu_initialized;
+cpumask_t cpu_sibling_setup_map;
+
+#endif /* CONFIG_X86_32 */
+
+
static struct cpu_dev *this_cpu __cpuinitdata;
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ /*
+ * We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ *
+ * The TLS descriptors are currently at a different place compared to i386.
+ * Hopefully nobody expects them at a fixed place (Wine?)
+ */
[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
#else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -89,9 +122,10 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
- [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ GDT_STACK_CANARY_INIT
#endif
+} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
#ifdef CONFIG_X86_32
@@ -192,6 +226,49 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
#endif
/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+ u32 feature; /* X86_FEATURE_* flag that gets cleared */
+ u32 level; /* CPUID level this feature is listed against */
+};
+/* Terminated by an all-zero entry: filter_cpuid_features() stops at feature == 0 */
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+ { X86_FEATURE_MWAIT, 0x00000005 },
+ { X86_FEATURE_DCA, 0x00000009 },
+ { X86_FEATURE_XSAVE, 0x0000000d },
+ { 0, 0 }
+};
+
+/*
+ * Clear every feature from cpuid_dependent_features[] that the CPU
+ * advertises but whose required CPUID level is above what the CPU reports.
+ *
+ * @c:    CPU whose capability bits are filtered
+ * @warn: when true, print a KERN_WARNING line for each feature cleared
+ */
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+	const struct cpuid_dependent_feature *df;
+
+	for (df = cpuid_dependent_features; df->feature; df++) {
+		/*
+		 * Note: cpuid_level is set to -1 if unavailable, but
+		 * extended_cpuid_level is set to 0 if unavailable
+		 * and the legitimate extended levels are all negative
+		 * when signed; hence the weird messing around with
+		 * signs here...
+		 *
+		 * The comparison has to use the required level
+		 * (df->level), not the feature bit number.
+		 */
+		if (cpu_has(c, df->feature) &&
+		    ((s32)df->level < 0 ?
+		     (u32)df->level > (u32)c->extended_cpuid_level :
+		     (s32)df->level > (s32)c->cpuid_level)) {
+			clear_cpu_cap(c, df->feature);
+			if (warn)
+				printk(KERN_WARNING
+				       "CPU: CPU feature %s disabled "
+				       "due to lack of CPUID level 0x%x\n",
+				       x86_cap_flags[df->feature],
+				       df->level);
+		}
+	}
+}
+
+/*
* Naming convention should be: <Name> [(<Codename>)]
* This table only is used unless init_<vendor>() below doesn't set it;
* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
@@ -221,18 +298,29 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+/*
+ * Reload the per-cpu segment register for @cpu. On 32-bit, fs is loaded
+ * with __KERNEL_PERCPU; otherwise gs is loaded with 0 and MSR_GS_BASE is
+ * written from @cpu's irq_stack_union.gs_base. The stack canary segment
+ * is loaded afterwards in both cases.
+ */
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ loadsegment(fs, __KERNEL_PERCPU);
+#else
+ loadsegment(gs, 0);
+ wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#endif
+ load_stack_canary_segment();
+}
+
/* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one. */
-void switch_to_new_gdt(void)
+void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.address = (long)get_cpu_gdt_table(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
-#ifdef CONFIG_X86_32
- asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-#endif
+ /* Reload the per-cpu base */
+
+ load_percpu_segment(cpu);
}
static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -354,7 +442,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
} else if (smp_num_siblings > 1) {
- if (smp_num_siblings > NR_CPUS) {
+ if (smp_num_siblings > nr_cpu_ids) {
printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
smp_num_siblings);
smp_num_siblings = 1;
@@ -362,11 +450,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
}
index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->phys_proc_id = phys_pkg_id(index_msb);
-#else
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-#endif
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
smp_num_siblings = smp_num_siblings / c->x86_max_cores;
@@ -374,13 +458,8 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
core_bits = get_count_order(c->x86_max_cores);
-#ifdef CONFIG_X86_64
- c->cpu_core_id = phys_pkg_id(index_msb) &
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
((1 << core_bits) - 1);
-#else
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-#endif
}
out:
@@ -549,11 +628,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
- validate_pat_support(c);
-
#ifdef CONFIG_SMP
c->cpu_index = boot_cpu_id;
#endif
+ filter_cpuid_features(c, false);
}
void __init early_cpu_init(void)
@@ -616,7 +694,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
# endif
@@ -663,7 +741,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_identify(c);
#ifdef CONFIG_X86_64
- c->apicid = phys_pkg_id(0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
/*
@@ -687,6 +765,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
* we do "generic changes."
*/
+ /* Filter out anything that depends on CPUID levels we don't have */
+ filter_cpuid_features(c, true);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
char *p;
@@ -703,6 +784,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
detect_ht(c);
#endif
+ init_hypervisor(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -854,57 +936,23 @@ static __init int setup_disablecpuid(char *arg)
}
__setup("clearcpuid=", setup_disablecpuid);
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+ irq_stack_union) __aligned(PAGE_SIZE);
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
-void __cpuinit pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
- /* Setup up data that may be needed in __get_free_pages early */
- loadsegment(fs, 0);
- loadsegment(gs, 0);
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack = (unsigned long)stack_thread_info() -
- PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- pda->irqstackptr += IRQSTACKSIZE - 64;
- } else {
- if (!pda->irqstackptr) {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d",
- cpu);
- pda->irqstackptr += IRQSTACKSIZE - 64;
- }
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
- pda->nodenumber = cpu_to_node(cpu);
- }
-}
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+ __aligned(PAGE_SIZE);
extern asmlinkage void ignore_sysret(void);
@@ -937,16 +985,21 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
-#else
+#else /* x86_64 */
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+DEFINE_PER_CPU(unsigned long, stack_canary);
+#endif
-/* Make sure %fs is initialized properly in idle threads */
+/* Make sure %fs and %gs are initialized properly in idle threads */
struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
+ regs->gs = __KERNEL_STACK_CANARY;
return regs;
}
-#endif
+#endif /* x86_64 */
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -962,19 +1015,18 @@ void __cpuinit cpu_init(void)
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
unsigned long v;
- char *estacks = NULL;
struct task_struct *me;
int i;
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0)
- pda_init(cpu);
- else
- estacks = boot_exception_stacks;
+#ifdef CONFIG_NUMA
+ if (cpu != 0 && percpu_read(node_number) == 0 &&
+ cpu_to_node(cpu) != NUMA_NO_NODE)
+ percpu_write(node_number, cpu_to_node(cpu));
+#endif
me = current;
- if (cpu_test_and_set(cpu, cpu_initialized))
+ if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
panic("CPU#%d already initialized!\n", cpu);
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -986,7 +1038,9 @@ void __cpuinit cpu_init(void)
* and set up the GDT descriptor:
*/
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
+ loadsegment(fs, 0);
+
load_idt((const struct desc_ptr *)&idt_descr);
memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1004,18 +1058,13 @@ void __cpuinit cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ static const unsigned int sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
};
+ char *estacks = per_cpu(exception_stacks, cpu);
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
+ estacks += sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
@@ -1049,22 +1098,19 @@ void __cpuinit cpu_init(void)
*/
if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
arch_kgdb_ops.correct_hw_break();
- else {
+ else
#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
+ {
+ /*
+ * Clear all 6 debug registers:
+ */
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
}
-#endif
fpu_init();
@@ -1083,7 +1129,7 @@ void __cpuinit cpu_init(void)
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &curr->thread;
- if (cpu_test_and_set(cpu, cpu_initialized)) {
+ if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
for (;;) local_irq_enable();
}
@@ -1094,7 +1140,7 @@ void __cpuinit cpu_init(void)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_idt(&idt_descr);
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
/*
* Set up and load the per-CPU TSS and LDT
@@ -1115,9 +1161,6 @@ void __cpuinit cpu_init(void)
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %gs. */
- asm volatile ("mov %0, %%gs" : : "r" (0));
-
/* Clear all 6 debug registers: */
set_debugreg(0, 0);
set_debugreg(0, 1);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index efae3b22a0f..65792c2cc46 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -245,17 +245,6 @@ config X86_E_POWERSAVER
comment "shared options"
-config X86_ACPI_CPUFREQ_PROC_INTF
- bool "/proc/acpi/processor/../performance interface (deprecated)"
- depends on PROC_FS
- depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
- help
- This enables the deprecated /proc/acpi/processor/../performance
- interface. While it is helpful for debugging, the generic,
- cross-architecture cpufreq interfaces should be used.
-
- If in doubt, say N.
-
config X86_SPEEDSTEP_LIB
tristate
default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467..4b1c319d30c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/compiler.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
#include <linux/acpi.h>
#include <acpi/processor.h>
@@ -144,13 +145,14 @@ typedef union {
struct drv_cmd {
unsigned int type;
- cpumask_t mask;
+ const struct cpumask *mask;
drv_addr_union addr;
u32 val;
};
-static void do_drv_read(struct drv_cmd *cmd)
+static long do_drv_read(void *_cmd)
{
+ struct drv_cmd *cmd = _cmd;
u32 h;
switch (cmd->type) {
@@ -165,10 +167,12 @@ static void do_drv_read(struct drv_cmd *cmd)
default:
break;
}
+ return 0;
}
-static void do_drv_write(struct drv_cmd *cmd)
+static long do_drv_write(void *_cmd)
{
+ struct drv_cmd *cmd = _cmd;
u32 lo, hi;
switch (cmd->type) {
@@ -185,48 +189,41 @@ static void do_drv_write(struct drv_cmd *cmd)
default:
break;
}
+ return 0;
}
static void drv_read(struct drv_cmd *cmd)
{
- cpumask_t saved_mask = current->cpus_allowed;
cmd->val = 0;
- set_cpus_allowed_ptr(current, &cmd->mask);
- do_drv_read(cmd);
- set_cpus_allowed_ptr(current, &saved_mask);
+ work_on_cpu(cpumask_any(cmd->mask), do_drv_read, cmd);
}
static void drv_write(struct drv_cmd *cmd)
{
- cpumask_t saved_mask = current->cpus_allowed;
unsigned int i;
- for_each_cpu_mask_nr(i, cmd->mask) {
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
- do_drv_write(cmd);
+ for_each_cpu(i, cmd->mask) {
+ work_on_cpu(i, do_drv_write, cmd);
}
-
- set_cpus_allowed_ptr(current, &saved_mask);
- return;
}
-static u32 get_cur_val(const cpumask_t *mask)
+static u32 get_cur_val(const struct cpumask *mask)
{
struct acpi_processor_performance *perf;
struct drv_cmd cmd;
- if (unlikely(cpus_empty(*mask)))
+ if (unlikely(cpumask_empty(mask)))
return 0;
- switch (per_cpu(drv_data, first_cpu(*mask))->cpu_feature) {
+ switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(drv_data, first_cpu(*mask))->acpi_data;
+ perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data;
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
break;
@@ -234,8 +231,7 @@ static u32 get_cur_val(const cpumask_t *mask)
return 0;
}
- cmd.mask = *mask;
-
+ cmd.mask = mask;
drv_read(&cmd);
dprintk("get_cur_val = %u\n", cmd.val);
@@ -243,6 +239,30 @@ static u32 get_cur_val(const cpumask_t *mask)
return cmd.val;
}
+/*
+ * Snapshot of the APERF and MPERF counters. Each value can be accessed
+ * either as one 64-bit quantity (whole) or as the lo/hi 32-bit halves
+ * that rdmsr() fills in read_measured_perf_ctrs().
+ */
+struct perf_cur {
+ union {
+ struct {
+ u32 lo;
+ u32 hi;
+ } split;
+ u64 whole;
+ } aperf_cur, mperf_cur;
+};
+
+
+/*
+ * Callback handed to work_on_cpu() by get_measured_perf(): read
+ * MSR_IA32_APERF and MSR_IA32_MPERF into the struct perf_cur passed as
+ * @_cur, then write both MSRs back to zero. Always returns 0.
+ */
+static long read_measured_perf_ctrs(void *_cur)
+{
+ struct perf_cur *cur = _cur;
+
+ rdmsr(MSR_IA32_APERF, cur->aperf_cur.split.lo, cur->aperf_cur.split.hi);
+ rdmsr(MSR_IA32_MPERF, cur->mperf_cur.split.lo, cur->mperf_cur.split.hi);
+
+ /* Reset both counters after they have been read */
+ wrmsr(MSR_IA32_APERF, 0, 0);
+ wrmsr(MSR_IA32_MPERF, 0, 0);
+
+ return 0;
+}
+
/*
* Return the measured active (C0) frequency on this CPU since last call
* to this function.
@@ -259,31 +279,12 @@ static u32 get_cur_val(const cpumask_t *mask)
static unsigned int get_measured_perf(struct cpufreq_policy *policy,
unsigned int cpu)
{
- union {
- struct {
- u32 lo;
- u32 hi;
- } split;
- u64 whole;
- } aperf_cur, mperf_cur;
-
- cpumask_t saved_mask;
+ struct perf_cur cur;
unsigned int perf_percent;
unsigned int retval;
- saved_mask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (get_cpu() != cpu) {
- /* We were not able to run on requested processor */
- put_cpu();
+ /*
+  * read_measured_perf_ctrs() returns 0 once the counters have been
+  * read, and work_on_cpu() passes that value through; only a non-zero
+  * result means the counters are unavailable.
+  */
+ if (work_on_cpu(cpu, read_measured_perf_ctrs, &cur))
return 0;
- }
-
- rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
- rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
-
- wrmsr(MSR_IA32_APERF, 0,0);
- wrmsr(MSR_IA32_MPERF, 0,0);
#ifdef __i386__
/*
@@ -291,37 +292,39 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
* Get an approximate value. Return failure in case we cannot get
* an approximate value.
*/
- if (unlikely(aperf_cur.split.hi || mperf_cur.split.hi)) {
+ if (unlikely(cur.aperf_cur.split.hi || cur.mperf_cur.split.hi)) {
int shift_count;
u32 h;
- h = max_t(u32, aperf_cur.split.hi, mperf_cur.split.hi);
+ h = max_t(u32, cur.aperf_cur.split.hi, cur.mperf_cur.split.hi);
shift_count = fls(h);
- aperf_cur.whole >>= shift_count;
- mperf_cur.whole >>= shift_count;
+ cur.aperf_cur.whole >>= shift_count;
+ cur.mperf_cur.whole >>= shift_count;
}
- if (((unsigned long)(-1) / 100) < aperf_cur.split.lo) {
+ if (((unsigned long)(-1) / 100) < cur.aperf_cur.split.lo) {
int shift_count = 7;
- aperf_cur.split.lo >>= shift_count;
- mperf_cur.split.lo >>= shift_count;
+ cur.aperf_cur.split.lo >>= shift_count;
+ cur.mperf_cur.split.lo >>= shift_count;
}
- if (aperf_cur.split.lo && mperf_cur.split.lo)
- perf_percent = (aperf_cur.split.lo * 100) / mperf_cur.split.lo;
+ if (cur.aperf_cur.split.lo && cur.mperf_cur.split.lo)
+ perf_percent = (cur.aperf_cur.split.lo * 100) /
+ cur.mperf_cur.split.lo;
else
perf_percent = 0;
#else
- if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
+ if (unlikely(((unsigned long)(-1) / 100) < cur.aperf_cur.whole)) {
int shift_count = 7;
- aperf_cur.whole >>= shift_count;
- mperf_cur.whole >>= shift_count;
+ cur.aperf_cur.whole >>= shift_count;
+ cur.mperf_cur.whole >>= shift_count;
}
- if (aperf_cur.whole && mperf_cur.whole)
- perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole;
+ if (cur.aperf_cur.whole && cur.mperf_cur.whole)
+ perf_percent = (cur.aperf_cur.whole * 100) /
+ cur.mperf_cur.whole;
else
perf_percent = 0;
@@ -329,10 +332,6 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
- put_cpu();
- set_cpus_allowed_ptr(current, &saved_mask);
-
- dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
return retval;
}
@@ -350,7 +349,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
}
cached_freq = data->freq_table[data->acpi_data->state].frequency;
- freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
+ freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
if (freq != cached_freq) {
/*
* The dreaded BIOS frequency change behind our back.
@@ -364,7 +363,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
return freq;
}
-static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
+static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
struct acpi_cpufreq_data *data)
{
unsigned int cur_freq;
@@ -385,12 +384,12 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
struct acpi_processor_performance *perf;
struct cpufreq_freqs freqs;
- cpumask_t online_policy_cpus;
struct drv_cmd cmd;
unsigned int next_state = 0; /* Index into freq_table */
unsigned int next_perf_state = 0; /* Index into perf table */
unsigned int i;
int result = 0;
+ struct power_trace it;
dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
@@ -404,15 +403,10 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
data->freq_table,
target_freq,
relation, &next_state);
- if (unlikely(result))
- return -ENODEV;
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* cpufreq holds the hotplug lock, so we are safe from here on */
- cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
-#else
- online_policy_cpus = policy->cpus;
-#endif
+ if (unlikely(result)) {
+ result = -ENODEV;
+ goto out;
+ }
next_perf_state = data->freq_table[next_state].index;
if (perf->state == next_perf_state) {
@@ -423,10 +417,12 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
} else {
dprintk("Already at target state (P%d)\n",
next_perf_state);
- return 0;
+ goto out;
}
}
+ trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -440,19 +436,19 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
cmd.val = (u32) perf->states[next_perf_state].control;
break;
default:
- return -ENODEV;
+ result = -ENODEV;
+ goto out;
}
- cpus_clear(cmd.mask);
-
+ /* cpufreq holds the hotplug lock, so we are safe from here on */
if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
- cmd.mask = online_policy_cpus;
+ cmd.mask = policy->cpus;
else
- cpu_set(policy->cpu, cmd.mask);
+ cmd.mask = cpumask_of(policy->cpu);
freqs.old = perf->states[perf->state].core_frequency * 1000;
freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu_mask_nr(i, cmd.mask) {
+ for_each_cpu(i, cmd.mask) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -460,19 +456,21 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
drv_write(&cmd);
if (acpi_pstate_strict) {
- if (!check_freqs(&cmd.mask, freqs.new, data)) {
+ if (!check_freqs(cmd.mask, freqs.new, data)) {
dprintk("acpi_cpufreq_target failed (%d)\n",
policy->cpu);
- return -EAGAIN;
+ result = -EAGAIN;
+ goto out;
}
}
- for_each_cpu_mask_nr(i, cmd.mask) {
+ for_each_cpu(i, cmd.mask) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
perf->state = next_perf_state;
+out:
return result;
}
@@ -513,6 +511,17 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
}
}
+static void free_acpi_perf_data(void)
+{
+ unsigned int i;
+
+ /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
+ for_each_possible_cpu(i)
+ free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
+ ->shared_cpu_map);
+ free_percpu(acpi_perf_data);
+}
+
/*
* acpi_cpufreq_early_init - initialize ACPI P-States library
*
@@ -523,6 +532,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
*/
static int __init acpi_cpufreq_early_init(void)
{
+ unsigned int i;
dprintk("acpi_cpufreq_early_init\n");
acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
@@ -530,6 +540,16 @@ static int __init acpi_cpufreq_early_init(void)
dprintk("Memory allocation error for acpi_perf_data.\n");
return -ENOMEM;
}
+ for_each_possible_cpu(i) {
+ if (!alloc_cpumask_var_node(
+ &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
+ GFP_KERNEL, cpu_to_node(i))) {
+
+ /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
+ free_acpi_perf_data();
+ return -ENOMEM;
+ }
+ }
/* Do initialization in ACPI core */
acpi_processor_preregister_performance(acpi_perf_data);
@@ -600,15 +620,15 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
*/
if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- policy->cpus = perf->shared_cpu_map;
+ cpumask_copy(policy->cpus, perf->shared_cpu_map);
}
- policy->related_cpus = perf->shared_cpu_map;
+ cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
#ifdef CONFIG_SMP
dmi_check_system(sw_any_bug_dmi_table);
- if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) {
+ if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
- policy->cpus = per_cpu(cpu_core_map, cpu);
+ cpumask_copy(policy->cpus, cpu_core_mask(cpu));
}
#endif
@@ -791,7 +811,7 @@ static int __init acpi_cpufreq_init(void)
ret = cpufreq_register_driver(&acpi_cpufreq_driver);
if (ret)
- free_percpu(acpi_perf_data);
+ free_acpi_perf_data();
return ret;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index b0461856acf..a4cff5d6e38 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -982,7 +982,7 @@ static int __init longhaul_init(void)
case 10:
printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
default:
- ;;
+ ;
}
return -ENODEV;
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b8e05ee4f73..b585e04cbc9 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
return 0;
/* notifiers */
- for_each_cpu_mask_nr(i, policy->cpus) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
/* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
* Developer's Manual, Volume 3
*/
- for_each_cpu_mask_nr(i, policy->cpus)
+ for_each_cpu(i, policy->cpus)
cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
/* notifiers */
- for_each_cpu_mask_nr(i, policy->cpus) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -160,6 +160,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x0E: /* Core */
case 0x0F: /* Core Duo */
+ case 0x16: /* Celeron Core */
p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
case 0x0D: /* Pentium M (Dothan) */
@@ -171,7 +172,9 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
}
if (c->x86 != 0xF) {
- printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
+ if (!cpu_has(c, X86_FEATURE_EST))
+ printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. "
+ "Please send an e-mail to <cpufreq@vger.kernel.org>\n");
return 0;
}
@@ -200,7 +203,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
unsigned int i;
#ifdef CONFIG_SMP
- policy->cpus = per_cpu(cpu_sibling_map, policy->cpu);
+ cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu));
#endif
/* Errata workaround */
@@ -274,6 +277,7 @@ static struct cpufreq_driver p4clockmod_driver = {
.name = "p4-clockmod",
.owner = THIS_MODULE,
.attr = p4clockmod_attr,
+ .hide_interface = 1,
};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 7c7d56b4313..1b446d79a8f 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -310,6 +310,12 @@ static int powernow_acpi_init(void)
goto err0;
}
+ if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
+ GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto err05;
+ }
+
if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
retval = -EIO;
goto err1;
@@ -412,6 +418,8 @@ static int powernow_acpi_init(void)
err2:
acpi_processor_unregister_performance(acpi_processor_perf, 0);
err1:
+ free_cpumask_var(acpi_processor_perf->shared_cpu_map);
+err05:
kfree(acpi_processor_perf);
err0:
printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n");
@@ -652,6 +660,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
#ifdef CONFIG_X86_POWERNOW_K7_ACPI
if (acpi_processor_perf) {
acpi_processor_unregister_performance(acpi_processor_perf, 0);
+ free_cpumask_var(acpi_processor_perf->shared_cpu_map);
kfree(acpi_processor_perf);
}
#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d3dcd58b87c..fb039cd345d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -115,9 +115,20 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
u32 i = 0;
if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
+ if (data->currpstate == HW_PSTATE_INVALID) {
+ /* read (initial) hw pstate if not yet set */
+ rdmsr(MSR_PSTATE_STATUS, lo, hi);
+ i = lo & HW_PSTATE_MASK;
+
+ /*
+ * a workaround for family 11h erratum 311 might cause
+ * an "out-of-range" Pstate if the core is in Pstate-0
+ */
+ if (i >= data->numps)
+ data->currpstate = HW_PSTATE_0;
+ else
+ data->currpstate = i;
+ }
return 0;
}
do {
@@ -755,7 +766,7 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
{
struct cpufreq_frequency_table *powernow_table;
- int ret_val;
+ int ret_val = -ENODEV;
if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
dprintk("register performance failed: bad ACPI data\n");
@@ -804,6 +815,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
/* notify BIOS that we exist */
acpi_processor_notify_smm(THIS_MODULE);
+ if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
+ printk(KERN_ERR PFX
+ "unable to alloc powernow_k8_data cpumask\n");
+ ret_val = -ENOMEM;
+ goto err_out_mem;
+ }
+
return 0;
err_out_mem:
@@ -815,7 +833,7 @@ err_out:
/* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
data->acpi_data.state_count = 0;
- return -ENODEV;
+ return ret_val;
}
static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
@@ -918,12 +936,28 @@ static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
{
if (data->acpi_data.state_count)
acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
+ free_cpumask_var(data->acpi_data.shared_cpu_map);
+}
+
+static int get_transition_latency(struct powernow_k8_data *data)
+{
+ int max_latency = 0;
+ int i;
+ for (i = 0; i < data->acpi_data.state_count; i++) {
+ int cur_latency = data->acpi_data.states[i].transition_latency
+ + data->acpi_data.states[i].bus_master_latency;
+ if (cur_latency > max_latency)
+ max_latency = cur_latency;
+ }
+ /* value in usecs, needs to be in nanoseconds */
+ return 1000 * max_latency;
}
#else
static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
+static int get_transition_latency(struct powernow_k8_data *data) { return 0; }
#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
/* Take a frequency, and issue the fid/vid transition command */
@@ -1121,8 +1155,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
}
data->cpu = pol->cpu;
+ data->currpstate = HW_PSTATE_INVALID;
- if (powernow_k8_cpu_init_acpi(data)) {
+ rc = powernow_k8_cpu_init_acpi(data);
+ if (rc) {
/*
* Use the PSB BIOS structure. This is only availabe on
* an UP version, and is deprecated by AMD.
@@ -1140,22 +1176,25 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
"ACPI maintainers and complain to your BIOS "
"vendor.\n");
#endif
- kfree(data);
- return -ENODEV;
+ goto err_out;
}
if (pol->cpu != 0) {
printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
"CPU other than CPU0. Complain to your BIOS "
"vendor.\n");
- kfree(data);
- return -ENODEV;
+ goto err_out;
}
rc = find_psb_table(data);
if (rc) {
- kfree(data);
- return -ENODEV;
+ goto err_out;
}
- }
+ /* Take a crude guess here.
+ * That guess was in microseconds, so multiply with 1000 */
+ pol->cpuinfo.transition_latency = (
+ ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
+ ((1 << data->irt) * 30)) * 1000;
+ } else /* ACPI _PSS objects available */
+ pol->cpuinfo.transition_latency = get_transition_latency(data);
/* only run on specific CPU from here on */
oldmask = current->cpus_allowed;
@@ -1181,15 +1220,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
set_cpus_allowed_ptr(current, &oldmask);
if (cpu_family == CPU_HW_PSTATE)
- pol->cpus = cpumask_of_cpu(pol->cpu);
+ cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
else
- pol->cpus = per_cpu(cpu_core_map, pol->cpu);
- data->available_cores = &(pol->cpus);
-
- /* Take a crude guess here.
- * That guess was in microseconds, so multiply with 1000 */
- pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
- + (3 * (1 << data->irt) * 10)) * 1000;
+ cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu));
+ data->available_cores = pol->cpus;
if (cpu_family == CPU_HW_PSTATE)
pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index ab48cfed4d9..8ecc75b6c7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,6 +5,19 @@
* http://www.gnu.org/licenses/gpl.html
*/
+
+enum pstate {
+ HW_PSTATE_INVALID = 0xff,
+ HW_PSTATE_0 = 0,
+ HW_PSTATE_1 = 1,
+ HW_PSTATE_2 = 2,
+ HW_PSTATE_3 = 3,
+ HW_PSTATE_4 = 4,
+ HW_PSTATE_5 = 5,
+ HW_PSTATE_6 = 6,
+ HW_PSTATE_7 = 7,
+};
+
struct powernow_k8_data {
unsigned int cpu;
@@ -23,7 +36,9 @@ struct powernow_k8_data {
u32 exttype; /* extended interface = 1 */
/* keep track of the current fid / vid or pstate */
- u32 currvid, currfid, currpstate;
+ u32 currvid;
+ u32 currfid;
+ enum pstate currpstate;
/* the powernow_table includes all frequency and vid/fid pairings:
* fid are the lower 8 bits of the index, vid are the upper 8 bits.
@@ -38,7 +53,7 @@ struct powernow_k8_data {
/* we need to keep track of associated cores, but let cpufreq
* handle hotplug events - so just point at cpufreq pol->cpus
* structure */
- cpumask_t *available_cores;
+ struct cpumask *available_cores;
};
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 3b5f06423e7..f08998278a3 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -458,13 +458,6 @@ static int centrino_verify (struct cpufreq_policy *policy)
*
* Sets a new CPUFreq policy.
*/
-struct allmasks {
- cpumask_t online_policy_cpus;
- cpumask_t saved_mask;
- cpumask_t set_mask;
- cpumask_t covered_cpus;
-};
-
static int centrino_target (struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
@@ -474,14 +467,15 @@ static int centrino_target (struct cpufreq_policy *policy,
struct cpufreq_freqs freqs;
int retval = 0;
unsigned int j, k, first_cpu, tmp;
- CPUMASK_ALLOC(allmasks);
- CPUMASK_PTR(online_policy_cpus, allmasks);
- CPUMASK_PTR(saved_mask, allmasks);
- CPUMASK_PTR(set_mask, allmasks);
- CPUMASK_PTR(covered_cpus, allmasks);
+ cpumask_var_t saved_mask, covered_cpus;
- if (unlikely(allmasks == NULL))
+ if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
+ return -ENOMEM;
+ if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
+ free_cpumask_var(saved_mask);
return -ENOMEM;
+ }
+ cpumask_copy(saved_mask, &current->cpus_allowed);
if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
retval = -ENODEV;
@@ -497,30 +491,26 @@ static int centrino_target (struct cpufreq_policy *policy,
goto out;
}
-#ifdef CONFIG_HOTPLUG_CPU
- /* cpufreq holds the hotplug lock, so we are safe from here on */
- cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
-#else
- *online_policy_cpus = policy->cpus;
-#endif
-
- *saved_mask = current->cpus_allowed;
first_cpu = 1;
- cpus_clear(*covered_cpus);
- for_each_cpu_mask_nr(j, *online_policy_cpus) {
+ for_each_cpu(j, policy->cpus) {
+ const struct cpumask *mask;
+
+ /* cpufreq holds the hotplug lock, so we are safe here */
+ if (!cpu_online(j))
+ continue;
+
/*
* Support for SMP systems.
* Make sure we are running on CPU that wants to change freq
*/
- cpus_clear(*set_mask);
if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- cpus_or(*set_mask, *set_mask, *online_policy_cpus);
+ mask = policy->cpus;
else
- cpu_set(j, *set_mask);
+ mask = cpumask_of(j);
- set_cpus_allowed_ptr(current, set_mask);
+ set_cpus_allowed_ptr(current, mask);
preempt_disable();
- if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
+ if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
dprintk("couldn't limit to CPUs in this domain\n");
retval = -EAGAIN;
if (first_cpu) {
@@ -548,7 +538,9 @@ static int centrino_target (struct cpufreq_policy *policy,
dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
target_freq, freqs.old, freqs.new, msr);
- for_each_cpu_mask_nr(k, *online_policy_cpus) {
+ for_each_cpu(k, policy->cpus) {
+ if (!cpu_online(k))
+ continue;
freqs.cpu = k;
cpufreq_notify_transition(&freqs,
CPUFREQ_PRECHANGE);
@@ -571,7 +563,9 @@ static int centrino_target (struct cpufreq_policy *policy,
preempt_enable();
}
- for_each_cpu_mask_nr(k, *online_policy_cpus) {
+ for_each_cpu(k, policy->cpus) {
+ if (!cpu_online(k))
+ continue;
freqs.cpu = k;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -584,18 +578,17 @@ static int centrino_target (struct cpufreq_policy *policy,
* Best effort undo..
*/
- if (!cpus_empty(*covered_cpus))
- for_each_cpu_mask_nr(j, *covered_cpus) {
- set_cpus_allowed_ptr(current,
- &cpumask_of_cpu(j));
- wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
- }
+ for_each_cpu_mask_nr(j, *covered_cpus) {
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(j));
+ wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
+ }
tmp = freqs.new;
freqs.new = freqs.old;
freqs.old = tmp;
- for_each_cpu_mask_nr(j, *online_policy_cpus) {
- freqs.cpu = j;
+ for_each_cpu(j, policy->cpus) {
+ if (!cpu_online(j))
+ continue;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -608,7 +601,8 @@ migrate_end:
preempt_enable();
set_cpus_allowed_ptr(current, saved_mask);
out:
- CPUMASK_FREE(allmasks);
+ free_cpumask_var(saved_mask);
+ free_cpumask_var(covered_cpus);
return retval;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 04d0376b64b..dedc1e98f16 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -229,7 +229,7 @@ static unsigned int speedstep_detect_chipset (void)
return 0;
}
-static unsigned int _speedstep_get(const cpumask_t *cpus)
+static unsigned int _speedstep_get(const struct cpumask *cpus)
{
unsigned int speed;
cpumask_t cpus_allowed;
@@ -244,7 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
static unsigned int speedstep_get(unsigned int cpu)
{
- return _speedstep_get(&cpumask_of_cpu(cpu));
+ return _speedstep_get(cpumask_of(cpu));
}
/**
@@ -267,7 +267,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
return -EINVAL;
- freqs.old = _speedstep_get(&policy->cpus);
+ freqs.old = _speedstep_get(policy->cpus);
freqs.new = speedstep_freqs[newstate].frequency;
freqs.cpu = policy->cpu;
@@ -279,20 +279,20 @@ static int speedstep_target (struct cpufreq_policy *policy,
cpus_allowed = current->cpus_allowed;
- for_each_cpu_mask_nr(i, policy->cpus) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
/* switch to physical CPU where state is to be changed */
- set_cpus_allowed_ptr(current, &policy->cpus);
+ set_cpus_allowed_ptr(current, policy->cpus);
speedstep_set_state(newstate);
/* allow to be run on all CPUs */
set_cpus_allowed_ptr(current, &cpus_allowed);
- for_each_cpu_mask_nr(i, policy->cpus) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -322,11 +322,11 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
/* only run on CPU to be set, or on its sibling */
#ifdef CONFIG_SMP
- policy->cpus = per_cpu(cpu_sibling_map, policy->cpu);
+ cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu));
#endif
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &policy->cpus);
+ set_cpus_allowed_ptr(current, policy->cpus);
/* detect low and high frequency and transition latency */
result = speedstep_get_freqs(speedstep_processor,
@@ -339,7 +339,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
return result;
/* get current speed setting */
- speed = _speedstep_get(&policy->cpus);
+ speed = _speedstep_get(policy->cpus);
if (!speed)
return -EIO;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 98d4fdb7dc0..cdac7d62369 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -139,6 +139,15 @@ static unsigned int pentium_core_get_frequency(void)
case 3:
fsb = 166667;
break;
+ case 2:
+ fsb = 200000;
+ break;
+ case 0:
+ fsb = 266667;
+ break;
+ case 4:
+ fsb = 333333;
+ break;
default:
printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
}
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 00000000000..fb5b86af0b0
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,58 @@
+/*
+ * Common hypervisor code
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/vmware.h>
+#include <asm/hypervisor.h>
+
+static inline void __cpuinit
+detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+{
+ if (vmware_platform()) {
+ c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+ } else {
+ c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
+ }
+}
+
+unsigned long get_hypervisor_tsc_freq(void)
+{
+ if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
+ return vmware_get_tsc_khz();
+ return 0;
+}
+
+static inline void __cpuinit
+hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+ vmware_set_feature_bits(c);
+ return;
+ }
+}
+
+void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
+{
+ detect_hypervisor_vendor(c);
+ hypervisor_set_feature_bits(c);
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d5..1f137a87d4b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
-#include <asm/ptrace.h>
#include <asm/ds.h>
#include <asm/bugs.h>
@@ -25,11 +24,24 @@
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
-#include <mach_apic.h>
+#include <asm/genapic.h>
#endif
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
+ /* Unmask CPUID levels if masked: */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ u64 misc_enable;
+
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+ if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
+ wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ c->cpuid_level = cpuid_eax(0);
+ }
+ }
+
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
@@ -41,6 +53,28 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 15 && c->x86_cache_alignment == 64)
c->x86_cache_alignment = 128;
#endif
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /*
+ * There is a known erratum on Pentium III and Core Solo
+ * and Core Duo CPUs.
+ * " Page with PAT set to WC while associated MTRR is UC
+ * may consolidate to UC "
+ * Because of this erratum, it is better to stick with
+ * setting WC in MTRR rather than using PAT on these CPUs.
+ *
+ * Enable PAT WC only on P4, Core 2 or later CPUs.
+ */
+ if (c->x86 == 6 && c->x86_model < 15)
+ clear_cpu_cap(c, X86_FEATURE_PAT);
}
#ifdef CONFIG_X86_32
@@ -242,6 +276,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
intel_workarounds(c);
+ /*
+ * Detect the extended topology information if available. This
+ * will reinitialise the initial_apicid which will be used
+ * in init_intel_cacheinfo()
+ */
+ detect_extended_topology(c);
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -262,6 +303,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
ds_init_intel(c);
}
+ if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
+ set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
@@ -307,13 +351,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
-
- if (cpu_has_bts)
- ptrace_bts_init_intel(c);
-
#endif
- detect_extended_topology(c);
if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
/*
* let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3f46afbb1cf..7293508d8f5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -36,8 +36,11 @@ static struct _cache_table cache_table[] __cpuinitdata =
{
{ 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
{ 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
+ { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
{ 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
{ 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
+ { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
+ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
{ 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -85,6 +88,18 @@ static struct _cache_table cache_table[] __cpuinitdata =
{ 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
{ 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
{ 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
+ { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
+ { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
+ { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
+ { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
+ { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
+ { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
+ { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
+ { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */
+ { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
+ { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
+ { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
{ 0x00, 0, 0}
};
@@ -132,7 +147,16 @@ struct _cpuid4_info {
union _cpuid4_leaf_ecx ecx;
unsigned long size;
unsigned long can_disable;
- cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
+ DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
+};
+
+/* subset of above _cpuid4_info w/o shared_cpu_map */
+struct _cpuid4_info_regs {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned long size;
+ unsigned long can_disable;
};
#ifdef CONFIG_PCI
@@ -263,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
}
static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
if (index < 3)
return;
@@ -271,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
}
static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+__cpuinit cpuid4_cache_lookup_regs(int index,
+ struct _cpuid4_info_regs *this_leaf)
{
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
@@ -299,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
return 0;
}
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+{
+ struct _cpuid4_info_regs *leaf_regs =
+ (struct _cpuid4_info_regs *)this_leaf;
+
+ return cpuid4_cache_lookup_regs(index, leaf_regs);
+}
+
static int __cpuinit find_num_cache_leaves(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -338,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
* parameters cpuid leaf to find the cache details
*/
for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info this_leaf;
-
+ struct _cpuid4_info_regs this_leaf;
int retval;
- retval = cpuid4_cache_lookup(i, &this_leaf);
+ retval = cpuid4_cache_lookup_regs(i, &this_leaf);
if (retval >= 0) {
switch(this_leaf.eax.split.level) {
case 1:
@@ -491,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
if (num_threads_sharing == 1)
- cpu_set(cpu, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
else {
index_msb = get_count_order(num_threads_sharing);
for_each_online_cpu(i) {
if (cpu_data(i).apicid >> index_msb ==
c->apicid >> index_msb) {
- cpu_set(i, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(i,
+ to_cpumask(this_leaf->shared_cpu_map));
if (i != cpu && per_cpu(cpuid4_info, i)) {
- sibling_leaf = CPUID4_INFO_IDX(i, index);
- cpu_set(cpu, sibling_leaf->shared_cpu_map);
+ sibling_leaf =
+ CPUID4_INFO_IDX(i, index);
+ cpumask_set_cpu(cpu, to_cpumask(
+ sibling_leaf->shared_cpu_map));
}
}
}
@@ -513,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
int sibling;
this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
+ for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpu_clear(cpu, sibling_leaf->shared_cpu_map);
+ cpumask_clear_cpu(cpu,
+ to_cpumask(sibling_leaf->shared_cpu_map));
}
}
#else
@@ -534,31 +571,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
per_cpu(cpuid4_info, cpu) = NULL;
}
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
+static void __cpuinit get_cpu_leaves(void *_retval)
{
- struct _cpuid4_info *this_leaf;
- unsigned long j;
- int retval;
- cpumask_t oldmask;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- oldmask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (retval)
- goto out;
+ int j, *retval = _retval, cpu = smp_processor_id();
/* Do cpuid and store the results */
for (j = 0; j < num_cache_leaves; j++) {
+ struct _cpuid4_info *this_leaf;
this_leaf = CPUID4_INFO_IDX(cpu, j);
- retval = cpuid4_cache_lookup(j, this_leaf);
- if (unlikely(retval < 0)) {
+ *retval = cpuid4_cache_lookup(j, this_leaf);
+ if (unlikely(*retval < 0)) {
int i;
for (i = 0; i < j; i++)
@@ -567,9 +589,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
}
cache_shared_cpu_map_setup(cpu, j);
}
- set_cpus_allowed_ptr(current, &oldmask);
+}
+
+static int __cpuinit detect_cache_attributes(unsigned int cpu)
+{
+ int retval;
+
+ if (num_cache_leaves == 0)
+ return -ENOENT;
-out:
+ per_cpu(cpuid4_info, cpu) = kzalloc(
+ sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+ if (per_cpu(cpuid4_info, cpu) == NULL)
+ return -ENOMEM;
+
+ smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
if (retval) {
kfree(per_cpu(cpuid4_info, cpu));
per_cpu(cpuid4_info, cpu) = NULL;
@@ -623,11 +657,12 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
int n = 0;
if (len > 1) {
- cpumask_t *mask = &this_leaf->shared_cpu_map;
+ const struct cpumask *mask;
+ mask = to_cpumask(this_leaf->shared_cpu_map);
n = type?
- cpulist_scnprintf(buf, len-2, *mask):
- cpumask_scnprintf(buf, len-2, *mask);
+ cpulist_scnprintf(buf, len-2, mask) :
+ cpumask_scnprintf(buf, len-2, mask);
buf[n++] = '\n';
buf[n] = '\0';
}
@@ -644,20 +679,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
return show_shared_cpu_map_func(leaf, 1, buf);
}
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
- switch(this_leaf->eax.split.type) {
- case CACHE_TYPE_DATA:
+static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+{
+ switch (this_leaf->eax.split.type) {
+ case CACHE_TYPE_DATA:
return sprintf(buf, "Data\n");
- break;
- case CACHE_TYPE_INST:
+ case CACHE_TYPE_INST:
return sprintf(buf, "Instruction\n");
- break;
- case CACHE_TYPE_UNIFIED:
+ case CACHE_TYPE_UNIFIED:
return sprintf(buf, "Unified\n");
- break;
- default:
+ default:
return sprintf(buf, "Unknown\n");
- break;
}
}
@@ -690,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
{
- int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+ int node = cpu_to_node(cpumask_first(mask));
struct pci_dev *dev = NULL;
ssize_t ret = 0;
int i;
@@ -724,7 +757,8 @@ static ssize_t
store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
size_t count)
{
- int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+ int node = cpu_to_node(cpumask_first(mask));
struct pci_dev *dev = NULL;
unsigned int ret, index, val;
@@ -869,7 +903,7 @@ err_out:
return -ENOMEM;
}
-static cpumask_t cache_dev_map = CPU_MASK_NONE;
+static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
/* Add/Remove cache interface for CPU device */
static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -909,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
}
kobject_uevent(&(this_object->kobj), KOBJ_ADD);
}
- cpu_set(cpu, cache_dev_map);
+ cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
return 0;
@@ -922,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
if (per_cpu(cpuid4_info, cpu) == NULL)
return;
- if (!cpu_isset(cpu, cache_dev_map))
+ if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
return;
- cpu_clear(cpu, cache_dev_map);
+ cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
for (i = 0; i < num_cache_leaves; i++)
kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 0ebf3fc6a61..dfaebce3633 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
/*
* mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
+ * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b031a4ac85..1c838032fd3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
- static cpumask_t mce_cpus = CPU_MASK_NONE;
-
mce_cpu_quirks(c);
if (mce_dont_init ||
- cpu_test_and_set(smp_processor_id(), mce_cpus) ||
!mce_available(c))
return;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 5eb390a4b2e..4772e91e824 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
- cpumask_t cpus;
+ cpumask_var_t cpus;
};
static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
* CPU Initialization
*/
+struct thresh_restart {
+ struct threshold_block *b;
+ int reset;
+ u16 old_limit;
+};
+
/* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_block *b,
- int reset, u16 old_limit)
+static long threshold_restart_bank(void *_tr)
{
+ struct thresh_restart *tr = _tr;
u32 mci_misc_hi, mci_misc_lo;
- rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+ rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
- if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
- reset = 1; /* limit cannot be lower than err count */
+ if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ tr->reset = 1; /* limit cannot be lower than err count */
- if (reset) { /* reset err count and overflow bit */
+ if (tr->reset) { /* reset err count and overflow bit */
mci_misc_hi =
(mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - b->threshold_limit);
- } else if (old_limit) { /* change limit w/o reset */
+ (THRESHOLD_MAX - tr->b->threshold_limit);
+ } else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (mci_misc_hi & THRESHOLD_MAX) +
- (old_limit - b->threshold_limit);
+ (tr->old_limit - tr->b->threshold_limit);
mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
- b->interrupt_enable ?
+ tr->b->interrupt_enable ?
(mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
(mci_misc_hi &= ~MASK_INT_TYPE_HI);
mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+ wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ return 0;
}
/* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int cpu = smp_processor_id();
u8 lvt_off;
u32 low = 0, high = 0, address = 0;
+ struct thresh_restart tr;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
wrmsr(address, low, high);
threshold_defaults.address = address;
- threshold_restart_bank(&threshold_defaults, 0, 0);
+ tr.b = &threshold_defaults;
+ tr.reset = 0;
+ tr.old_limit = 0;
+ threshold_restart_bank(&tr);
}
}
}
@@ -237,7 +248,7 @@ asmlinkage void mce_threshold_interrupt(void)
}
}
out:
- add_pda(irq_threshold_count, 1);
+ inc_irq_stat(irq_threshold_count);
irq_exit();
}
@@ -251,20 +262,6 @@ struct threshold_attr {
ssize_t(*store) (struct threshold_block *, const char *, size_t count);
};
-static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
- cpumask_t *newmask)
-{
- *oldmask = current->cpus_allowed;
- cpus_clear(*newmask);
- cpu_set(cpu, *newmask);
- set_cpus_allowed_ptr(current, newmask);
-}
-
-static void affinity_restore(const cpumask_t *oldmask)
-{
- set_cpus_allowed_ptr(current, oldmask);
-}
-
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
{ \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask, newmask;
+ struct thresh_restart tr;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
b->interrupt_enable = !!new;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 0, 0);
- affinity_restore(&oldmask);
+ tr.b = b;
+ tr.reset = 0;
+ tr.old_limit = 0;
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return end - buf;
}
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask, newmask;
- u16 old;
+ struct thresh_restart tr;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
- old = b->threshold_limit;
+ tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
+ tr.b = b;
+ tr.reset = 0;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 0, old);
- affinity_restore(&oldmask);
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return end - buf;
}
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
+static long local_error_count(void *_b)
{
- u32 high, low;
- cpumask_t oldmask, newmask;
- affinity_set(b->cpu, &oldmask, &newmask);
+ struct threshold_block *b = _b;
+ u32 low, high;
+
rdmsr(b->address, low, high);
- affinity_restore(&oldmask);
- return sprintf(buf, "%x\n",
- (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+ return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+ return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
}
static ssize_t store_error_count(struct threshold_block *b,
const char *buf, size_t count)
{
- cpumask_t oldmask, newmask;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 1, 0);
- affinity_restore(&oldmask);
+ struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return 1;
}
@@ -463,19 +462,26 @@ out_free:
return err;
}
+static __cpuinit long local_allocate_threshold_blocks(void *_bank)
+{
+ unsigned int *bank = _bank;
+
+ return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+ MSR_IA32_MC0_MISC + *bank * 4);
+}
+
/* symlinks sibling shared banks to first core. first core owns dir/files. */
static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
- cpumask_t oldmask, newmask;
char name[32];
sprintf(name, "threshold_bank%i", bank);
#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = first_cpu(per_cpu(cpu_core_map, cpu));
+ i = cpumask_first(&per_cpu(cpu_core_map, cpu));
/* first core not up yet */
if (cpu_data(i).cpu_core_id)
@@ -495,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out;
- b->cpus = per_cpu(cpu_core_map, cpu);
+ cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
per_cpu(threshold_banks, cpu)[bank] = b;
goto out;
}
@@ -506,28 +512,29 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = -ENOMEM;
goto out;
}
+ if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+ kfree(b);
+ err = -ENOMEM;
+ goto out;
+ }
b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
if (!b->kobj)
goto out_free;
#ifndef CONFIG_SMP
- b->cpus = CPU_MASK_ALL;
+ cpumask_setall(b->cpus);
#else
- b->cpus = per_cpu(cpu_core_map, cpu);
+ cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
- affinity_set(cpu, &oldmask, &newmask);
- err = allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
- affinity_restore(&oldmask);
-
+ err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
if (err)
goto out_free;
- for_each_cpu_mask_nr(i, b->cpus) {
+ for_each_cpu(i, b->cpus) {
if (i == cpu)
continue;
@@ -543,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
out_free:
per_cpu(threshold_banks, cpu)[bank] = NULL;
+ free_cpumask_var(b->cpus);
kfree(b);
out:
return err;
@@ -617,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#endif
/* remove all sibling symlinks before unregistering */
- for_each_cpu_mask_nr(i, b->cpus) {
+ for_each_cpu(i, b->cpus) {
if (i == cpu)
continue;
@@ -630,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
free_out:
kobject_del(b->kobj);
kobject_put(b->kobj);
+ free_cpumask_var(b->cpus);
kfree(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5dd6d..5e8c79e748a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <asm/processor.h>
+#include <asm/apic.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
@@ -26,7 +27,7 @@ asmlinkage void smp_thermal_interrupt(void)
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);
- add_pda(irq_thermal_count, 1);
+ inc_irq_stat(irq_thermal_count);
irq_exit();
}
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index bfa5817afdd..c9f77ea69ed 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -1,6 +1,6 @@
/*
* P5 specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@redhat.com>
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 62efc9c2b3a..2ac52d7b434 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -1,6 +1,6 @@
/*
* P6 specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@redhat.com>
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index f2be3e190c6..2a043d89811 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -1,6 +1,6 @@
/*
* IDT Winchip specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@redhat.com>
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01ee..0c0a455fe95 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
#include <asm/pat.h>
#include "mtrr.h"
-struct mtrr_state {
- struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
- mtrr_type fixed_ranges[NUM_FIXED_RANGES];
- unsigned char enabled;
- unsigned char have_fixed;
- mtrr_type def_type;
-};
-
struct fixed_range_block {
int base_msr; /* start address of an MTRR block */
int ranges; /* number of MTRRs in this block */
@@ -35,15 +27,19 @@ static struct fixed_range_block fixed_range_blocks[] = {
};
static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
static int mtrr_state_set;
u64 mtrr_tom2;
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX "mtrr."
+struct mtrr_state_type mtrr_state = {};
+EXPORT_SYMBOL_GPL(mtrr_state);
-static int mtrr_show;
-module_param_named(show, mtrr_show, bool, 0);
+static int __initdata mtrr_show;
+static int __init mtrr_debug(char *opt)
+{
+ mtrr_show = 1;
+ return 0;
+}
+early_param("mtrr.show", mtrr_debug);
/*
* Returns the effective MTRR type for the region
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea..236a401b825 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
u32 num_var_ranges = 0;
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
unsigned long lsize;
};
-static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
+static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
@@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
}
static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
#ifdef CONFIG_MTRR_SANITIZER
@@ -823,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
static int __init disable_mtrr_cleanup_setup(char *str)
{
- if (enable_mtrr_cleanup != -1)
- enable_mtrr_cleanup = 0;
+ enable_mtrr_cleanup = 0;
return 0;
}
early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
static int __init enable_mtrr_cleanup_setup(char *str)
{
- if (enable_mtrr_cleanup != -1)
- enable_mtrr_cleanup = 1;
+ enable_mtrr_cleanup = 1;
return 0;
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
@@ -1206,39 +1205,43 @@ struct mtrr_cleanup_result {
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static struct res_range __initdata range_new[RANGE_NUM];
static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-static int __init mtrr_cleanup(unsigned address_bits)
+static void __init print_out_mtrr_range_state(void)
{
- unsigned long extra_remove_base, extra_remove_size;
- unsigned long base, size, def, dummy;
- mtrr_type type;
- int nr_range, nr_range_new;
- u64 chunk_size, gran_size;
- unsigned long range_sums, range_sums_new;
- int index_good;
- int num_reg_good;
int i;
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+ mtrr_type type;
- /* extra one for all 0 */
- int num[MTRR_NUM_TYPES + 1];
+ for (i = 0; i < num_var_ranges; i++) {
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
- return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
- def &= 0xff;
- if (def != MTRR_TYPE_UNCACHABLE)
- return 0;
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
- /* get it and store it aside */
- memset(range_state, 0, sizeof(range_state));
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
- range_state[i].base_pfn = base;
- range_state[i].size_pfn = size;
- range_state[i].type = type;
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
}
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+ int i;
+ mtrr_type type;
+ unsigned long size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
/* check entries number */
memset(num, 0, sizeof(num));
@@ -1263,29 +1266,133 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
- /* print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
- for (i = 0; i < num_var_ranges; i++) {
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
+ return 1;
+}
- size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
- if (!size_base)
- continue;
+static unsigned long __initdata range_sums;
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size,
+ int i)
+{
+ int num_reg;
+ static struct res_range range_new[RANGE_NUM];
+ static int nr_range_new;
+ unsigned long range_sums_new;
+
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range,
+ chunk_size, gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base, extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
- size_base = to_size_factor(size_base, &size_factor),
- start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
- type = range_state[i].type;
+ /* double check it */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range ||
+ memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
- i, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
- ((type == MTRR_TYPE_WRPROT) ? "WP" :
- ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
- );
+ if (!result[i].bad && (range_sums - range_sums_new <
+ min_loss_pfn[num_reg])) {
+ min_loss_pfn[num_reg] =
+ range_sums - range_sums_new;
}
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad ? "*BAD*" : " ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad ? "-" : "",
+ lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+ int i;
+ int num_reg_good;
+ int index_good;
+
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i])
+ num_reg_good = i;
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ return index_good;
+}
+
+
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long extra_remove_base, extra_remove_size;
+ unsigned long base, size, def, dummy;
+ mtrr_type type;
+ u64 chunk_size, gran_size;
+ int index_good;
+ int i;
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* check if we need handle it and can handle it */
+ if (!mtrr_need_cleanup())
+ return 0;
+
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
extra_remove_size = 0;
@@ -1309,176 +1416,64 @@ static int __init mtrr_cleanup(unsigned address_bits)
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
- int num_reg;
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- debug_print++;
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
- mtrr_gran_size);
+ i = 0;
+ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+ extra_remove_base, extra_remove_size, i);
- /* we got new setting in range_state, check it */
- memset(range_new, 0, sizeof(range_new));
- nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base,
- extra_remove_size);
- range_sums_new = sum_ranges(range_new, nr_range_new);
+ mtrr_print_out_one_result(i);
- i = 0;
- result[i].chunk_sizek = mtrr_chunk_size >> 10;
- result[i].gran_sizek = mtrr_gran_size >> 10;
- result[i].num_reg = num_reg;
- if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
- result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad?"*BAD*":" ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad?"-":"",
- lose_base, lose_factor);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print--;
- memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
- char gran_factor;
- unsigned long gran_base;
-
- if (debug_print)
- gran_base = to_size_factor(gran_size >> 10, &gran_factor);
for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
- int num_reg;
- if (debug_print) {
- char chunk_factor;
- unsigned long chunk_base;
-
- chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
- printk(KERN_INFO "\n");
- printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
- gran_base, gran_factor, chunk_base, chunk_factor);
- }
if (i >= NUM_RESULT)
continue;
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range,
- chunk_size, gran_size);
-
- /* we got new setting in range_state, check it */
- memset(range_new, 0, sizeof(range_new));
- nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base, extra_remove_size);
- range_sums_new = sum_ranges(range_new, nr_range_new);
-
- result[i].chunk_sizek = chunk_size >> 10;
- result[i].gran_sizek = gran_size >> 10;
- result[i].num_reg = num_reg;
- if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
- result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
-
- /* double check it */
- if (!result[i].bad && !result[i].lose_cover_sizek) {
- if (nr_range_new != nr_range ||
- memcmp(range, range_new, sizeof(range)))
- result[i].bad = 1;
+ mtrr_calc_range_state(chunk_size, gran_size,
+ extra_remove_base, extra_remove_size, i);
+ if (debug_print) {
+ mtrr_print_out_one_result(i);
+ printk(KERN_INFO "\n");
}
- if (!result[i].bad && (range_sums - range_sums_new <
- min_loss_pfn[num_reg])) {
- min_loss_pfn[num_reg] =
- range_sums - range_sums_new;
- }
i++;
}
}
- /* print out all */
- for (i = 0; i < NUM_RESULT; i++) {
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad?"*BAD*":" ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad?"-":"",
- lose_base, lose_factor);
- }
-
/* try to find the optimal index */
- if (nr_mtrr_spare_reg >= num_var_ranges)
- nr_mtrr_spare_reg = num_var_ranges - 1;
- num_reg_good = -1;
- for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i])
- num_reg_good = i;
- }
-
- index_good = -1;
- if (num_reg_good != -1) {
- for (i = 0; i < NUM_RESULT; i++) {
- if (!result[i].bad &&
- result[i].num_reg == num_reg_good &&
- !result[i].lose_cover_sizek) {
- index_good = i;
- break;
- }
- }
- }
+ index_good = mtrr_search_optimal_index();
if (index_good != -1) {
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
- result[i].num_reg, lose_base, lose_factor);
+ mtrr_print_out_one_result(i);
+
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print++;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- debug_print--;
set_var_mtrr_all(address_bits);
+ printk(KERN_DEBUG "New variable MTRRs\n");
+ print_out_mtrr_range_state();
return 1;
+ } else {
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++)
+ mtrr_print_out_one_result(i);
}
printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
@@ -1562,7 +1557,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
- int nr_range;
u64 total_trim_size;
/* extra one for all 0 */
@@ -1600,8 +1594,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* kvm/qemu doesn't have mtrr set right, don't trim them all */
if (!highest_pfn) {
- WARN(!kvm_para_available(), KERN_WARNING
- "WARNING: strange, CPU MTRRs all blank?\n");
+ printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
return 0;
}
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b2..ffd60409cc6 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
#define MTRRcap_MSR 0x0fe
#define MTRRdefType_MSR 0x2ff
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-#define NUM_FIXED_RANGES 88
-#define MAX_VAR_RANGES 256
#define MTRRfix64K_00000_MSR 0x250
#define MTRRfix16K_80000_MSR 0x258
#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
- an 8 bit field: */
-typedef u8 mtrr_type;
-
-extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
u32 ccr3;
};
-struct mtrr_var_range {
- u32 base_lo;
- u32 base_hi;
- u32 mask_lo;
- u32 mask_hi;
-};
-
void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 00000000000..284c399e323
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,112 @@
+/*
+ * VMware Detection code.
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dmi.h>
+#include <asm/div64.h>
+#include <asm/vmware.h>
+
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
+#define VMWARE_HYPERVISOR_PORT 0x5658
+
+#define VMWARE_PORT_CMD_GETVERSION 10
+#define VMWARE_PORT_CMD_GETHZ 45
+
+#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
+ __asm__("inl (%%dx)" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "0"(VMWARE_HYPERVISOR_MAGIC), /* in: eax = magic */ \
+ "1"(VMWARE_PORT_CMD_##cmd), /* in: ecx = command number */ \
+ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : /* in: edx = port, ebx = UINT_MAX */ \
+ "memory");
+
+static inline int __vmware_platform(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); /* issue the GETVERSION port command */
+ return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; /* eax not all-ones, magic in ebx */
+}
+
+static unsigned long __vmware_get_tsc_khz(void)
+{
+ uint64_t tsc_hz;
+ uint32_t eax, ebx, ecx, edx;
+
+ VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+
+ if (ebx == UINT_MAX) /* ebx was preloaded with UINT_MAX; presumably unchanged means no answer - TODO confirm */
+ return 0;
+ tsc_hz = eax | (((uint64_t)ebx) << 32); /* eax = low 32 bits, ebx = high 32 bits */
+ do_div(tsc_hz, 1000); /* Hz -> kHz */
+ BUG_ON(tsc_hz >> 32); /* the kHz value must fit in 32 bits */
+ return tsc_hz;
+}
+
+/*
+ * While checking the dmi string information, just checking the product
+ * serial key should be enough, as this will always have a VMware
+ * specific string when running under VMware hypervisor.
+ */
+int vmware_platform(void)
+{
+ if (cpu_has_hypervisor) {
+ unsigned int eax, ebx, ecx, edx;
+ char hyper_vendor_id[13]; /* 3 x 4 register bytes plus the terminating NUL */
+
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
+ memcpy(hyper_vendor_id + 0, &ebx, 4); /* vendor id is assembled from ebx, ecx, edx in that order */
+ memcpy(hyper_vendor_id + 4, &ecx, 4);
+ memcpy(hyper_vendor_id + 8, &edx, 4);
+ hyper_vendor_id[12] = '\0';
+ if (!strcmp(hyper_vendor_id, "VMwareVMware"))
+ return 1;
+ } else if (dmi_available && dmi_name_in_serial("VMware") && /* cpu_has_hypervisor not set: DMI serial + port probe */
+ __vmware_platform())
+ return 1;
+
+ return 0;
+}
+
+unsigned long vmware_get_tsc_khz(void)
+{
+ BUG_ON(!vmware_platform()); /* must only be called when vmware_platform() reports VMware */
+ return __vmware_get_tsc_khz();
+}
+
+/*
+ * VMware hypervisor takes care of exporting a reliable TSC to the guest.
+ * Still, due to timing difference when running on virtual cpus, the TSC can
+ * be marked as unstable in some cases. For example, the TSC sync check at
+ * bootup can fail due to a marginal offset between vcpus' TSCs (though the
+ * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
+ * is not suitable as a watchdog when running on a hypervisor because the
+ * kernel may miss a wrap of the counter if the vcpu is descheduled for a
+ * long time. To skip these checks at runtime we set these capability bits,
+ * so that the kernel could just trust the hypervisor with providing a
+ * reliable virtual TSC that is suitable for timekeeping.
+ */
+void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+{
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 72cefd1e649..2ac1f0c2beb 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -39,10 +39,10 @@
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
}
static ssize_t cpuid_read(struct file *file, char __user *buf,
- size_t count, loff_t * ppos)
+ size_t count, loff_t *ppos)
{
char __user *tmp = buf;
struct cpuid_regs cmd;
@@ -117,11 +117,11 @@ static int cpuid_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
int ret = 0;
-
+
lock_kernel();
cpu = iminor(file->f_path.dentry->d_inode);
- if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
ret = -ENXIO; /* No such CPU */
goto out;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd..ad7f2a696f4 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,10 +24,11 @@
#include <asm/apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/reboot.h>
+#include <asm/virtext.h>
-#include <mach_ipi.h>
+#include <asm/genapic.h>
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
#endif
crash_save_cpu(regs, cpu);
+ /* Disable VMX or SVM if needed.
+ *
+ * We need to disable virtualization on all CPUs.
+ * Having VMX or SVM enabled on any CPU may break rebooting
+ * after the kdump kernel has finished its task.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
disable_local_APIC();
}
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
local_irq_disable();
kdump_nmi_shootdown_cpus();
+
+ /* Booting kdump kernel with VMX or SVM enabled won't work,
+ * because (among other limitations) we can't disable paging
+ * with the virt flags.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 2b69994fd3a..169a120587b 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,23 +6,20 @@
* precise-event based sampling (PEBS).
*
* It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - DS and BTS hardware configuration
+ * - buffer overflow handling (to be done)
* - buffer access
*
- * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
*
*
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
*/
-#ifdef CONFIG_X86_DS
-
#include <asm/ds.h>
#include <linux/errno.h>
@@ -30,22 +27,69 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/kernel.h>
/*
* The configuration for a particular DS hardware implementation.
*/
struct ds_configuration {
- /* the size of the DS structure in bytes */
- unsigned char sizeof_ds;
- /* the size of one pointer-typed field in the DS structure in bytes;
- this covers the first 8 fields related to buffer management. */
+ /* the name of the configuration */
+ const char *name;
+ /* the size of one pointer-typed field in the DS structure and
+ in the BTS and PEBS buffers in bytes;
+ this covers the first 8 DS fields related to buffer management. */
unsigned char sizeof_field;
/* the size of a BTS/PEBS record in bytes */
unsigned char sizeof_rec[2];
+ /* a series of bit-masks to control various features indexed
+ * by enum ds_feature */
+ unsigned long ctl[dsf_ctl_max];
+};
+static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+
+#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+
+#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
+#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
+#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
+
+#define BTS_CONTROL \
+ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
+ ds_cfg.ctl[dsf_bts_overflow])
+
+
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+ /* the DS context (partially) owned by this tracer */
+ struct ds_context *context;
+ /* the buffer provided on ds_request() and its size in bytes */
+ void *buffer;
+ size_t size;
+};
+
+struct bts_tracer {
+ /* the common DS part */
+ struct ds_tracer ds;
+ /* the trace including the DS configuration */
+ struct bts_trace trace;
+ /* buffer overflow notification function */
+ bts_ovfl_callback_t ovfl;
};
-static struct ds_configuration ds_cfg;
+struct pebs_tracer {
+ /* the common DS part */
+ struct ds_tracer ds;
+ /* the trace including the DS configuration */
+ struct pebs_trace trace;
+ /* buffer overflow notification function */
+ pebs_ovfl_callback_t ovfl;
+};
/*
* Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -111,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
/*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
+ * Locking is done only for allocating BTS or PEBS resources.
*/
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
-
-/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
- */
-static inline int ds_validate_access(struct ds_context *context,
- enum ds_qualifier qual)
-{
- if (!context)
- return -EPERM;
-
- if (context->owner[qual] == current)
- return 0;
-
- return -EPERM;
-}
+static DEFINE_SPINLOCK(ds_lock);
/*
@@ -152,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
* >0 number of per-thread tracers
* <0 number of per-cpu tracers
*
- * The below functions to get and put tracers and to check the
- * allocation type require the ds_lock to be held by the caller.
- *
* Tracers essentially gives the number of ds contexts for a certain
* type of allocation.
*/
-static long tracers;
+static atomic_t tracers = ATOMIC_INIT(0);
static inline void get_tracer(struct task_struct *task)
{
- tracers += (task ? 1 : -1);
+ if (task)
+ atomic_inc(&tracers);
+ else
+ atomic_dec(&tracers);
}
static inline void put_tracer(struct task_struct *task)
{
- tracers -= (task ? 1 : -1);
+ if (task)
+ atomic_dec(&tracers);
+ else
+ atomic_inc(&tracers);
}
static inline int check_tracer(struct task_struct *task)
{
- return (task ? (tracers >= 0) : (tracers <= 0));
+ return task ?
+ (atomic_read(&tracers) >= 0) :
+ (atomic_read(&tracers) <= 0);
}
@@ -185,100 +211,83 @@ static inline int check_tracer(struct task_struct *task)
*
* Contexts are use-counted. They are allocated on first access and
* deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- * requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- * non-existing context indicates an error. It acquires and releases
- * the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
- */
-static DEFINE_PER_CPU(struct ds_context *, system_context);
-
-#define this_system_context per_cpu(system_context, smp_processor_id())
-
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
*/
-static inline struct ds_context *ds_get_context(struct task_struct *task)
-{
- struct ds_context *context;
+struct ds_context {
+ /* the DS configuration itself; its address goes into MSR_IA32_DS_AREA */
+ unsigned char ds[MAX_SIZEOF_DS];
+ /* the owner of the BTS and PEBS configuration, respectively */
+ struct bts_tracer *bts_master;
+ struct pebs_tracer *pebs_master;
+ /* use count */
+ unsigned long count;
+ /* a pointer to the context location inside the thread_struct
+ * or the per_cpu context array */
+ struct ds_context **this;
+ /* a pointer to the task owning this context, or NULL, if the
+ * context is owned by a cpu */
+ struct task_struct *task;
+};
- spin_lock(&ds_lock);
+static DEFINE_PER_CPU(struct ds_context *, system_context_array);
- context = (task ? task->thread.ds_ctx : this_system_context);
- if (context)
- context->count++;
+#define system_context per_cpu(system_context_array, smp_processor_id())
- spin_unlock(&ds_lock);
- return context;
-}
-
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- *
- * pre: requires ds_lock to be held
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
+static inline struct ds_context *ds_get_context(struct task_struct *task)
{
struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &this_system_context);
- struct ds_context *context = *p_context;
-
- if (!context) {
- context = kzalloc(sizeof(*context), GFP_KERNEL);
-
- if (!context)
- return NULL;
+ (task ? &task->thread.ds_ctx : &system_context);
+ struct ds_context *context = NULL;
+ struct ds_context *new_context = NULL;
+ unsigned long irq;
+
+ /* Chances are small that we already have a context. */
+ new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
+ if (!new_context)
+ return NULL;
- context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
- if (!context->ds) {
- kfree(context);
- return NULL;
- }
+ spin_lock_irqsave(&ds_lock, irq);
- *p_context = context;
+ context = *p_context;
+ if (!context) {
+ context = new_context;
context->this = p_context;
context->task = task;
+ context->count = 0;
if (task)
set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
if (!task || (task == current))
- wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
- get_tracer(task);
+ *p_context = context;
}
context->count++;
+ spin_unlock_irqrestore(&ds_lock, irq);
+
+ if (context != new_context)
+ kfree(new_context);
+
return context;
}
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
static inline void ds_put_context(struct ds_context *context)
{
+ unsigned long irq;
+
if (!context)
return;
- spin_lock(&ds_lock);
+ spin_lock_irqsave(&ds_lock, irq);
- if (--context->count)
- goto out;
+ if (--context->count) {
+ spin_unlock_irqrestore(&ds_lock, irq);
+ return;
+ }
*(context->this) = NULL;
@@ -288,132 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
if (!context->task || (context->task == current))
wrmsrl(MSR_IA32_DS_AREA, 0);
- put_tracer(context->task);
+ spin_unlock_irqrestore(&ds_lock, irq);
- /* free any leftover buffers from tracers that did not
- * deallocate them properly. */
- kfree(context->buffer[ds_bts]);
- kfree(context->buffer[ds_pebs]);
- kfree(context->ds);
kfree(context);
- out:
- spin_unlock(&ds_lock);
}
/*
- * Handle a buffer overflow
+ * Call the tracer's callback on a buffer overflow.
*
- * task: the task whose buffers are overflowing;
- * NULL for a buffer overflow on the current cpu
* context: the ds context
* qual: the buffer type
*/
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
- enum ds_qualifier qual)
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
{
- if (!context)
- return;
-
- if (context->callback[qual])
- (*context->callback[qual])(task);
-
- /* todo: do some more overflow handling */
+ switch (qual) {
+ case ds_bts:
+ if (context->bts_master &&
+ context->bts_master->ovfl)
+ context->bts_master->ovfl(context->bts_master);
+ break;
+ case ds_pebs:
+ if (context->pebs_master &&
+ context->pebs_master->ovfl)
+ context->pebs_master->ovfl(context->pebs_master);
+ break;
+ }
}
/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
+ * Write raw data into the BTS or PEBS buffer.
*
- * Returns the buffer, if successful;
- * NULL, if out of memory or rlimit exceeded.
+ * The remainder of any partially written record is zeroed out.
*
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
+ * context: the DS context
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
*/
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+ const void *record, size_t size)
{
- unsigned long rlim, vm, pgsz;
- void *buffer;
+ int bytes_written = 0;
- pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ if (!record)
+ return -EINVAL;
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + pgsz;
- if (rlim < vm)
- return NULL;
+ while (size) {
+ unsigned long base, index, end, write_end, int_th;
+ unsigned long write_size, adj_write_size;
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + pgsz;
- if (rlim < vm)
- return NULL;
+ /*
+ * write as much as possible without producing an
+ * overflow interrupt.
+ *
+ * interrupt_threshold must either be
+ * - bigger than absolute_maximum or
+ * - point to a record between buffer_base and absolute_maximum
+ *
+ * index points to a valid record.
+ */
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+ int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- return NULL;
+ write_end = min(end, int_th);
+
+ /* if we are already beyond the interrupt threshold,
+ * we fill the entire buffer */
+ if (write_end <= index)
+ write_end = end;
- current->mm->total_vm += pgsz;
- current->mm->locked_vm += pgsz;
+ if (write_end <= index)
+ break;
- if (pages)
- *pages = pgsz;
+ write_size = min((unsigned long) size, write_end - index);
+ memcpy((void *)index, record, write_size);
- return buffer;
+ record = (const char *)record + write_size;
+ size -= write_size;
+ bytes_written += write_size;
+
+ adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+ adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+ /* zero out trailing bytes */
+ memset((char *)index + write_size, 0,
+ adj_write_size - write_size);
+ index += adj_write_size;
+
+ if (index >= end)
+ index = base;
+ ds_set(context->ds, qual, ds_index, index);
+
+ if (index >= int_th)
+ ds_overflow(context, qual);
+ }
+
+ return bytes_written;
}
-static int ds_request(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ *
+ * We use two levels of abstraction:
+ * - the raw data level defined here
+ * - an arch-independent level defined in ds.h
+ */
+
+enum bts_field {
+ bts_from, /* field indices; bts_get()/bts_set() scale them by ds_cfg.sizeof_field */
+ bts_to,
+ bts_flags,
+
+ bts_qual = bts_from, /* escape records reuse the from/to/flags slots under these names */
+ bts_jiffies = bts_to,
+ bts_pid = bts_flags,
+
+ bts_qual_mask = (bts_qual_max - 1), /* low bits carrying the qualifier; assumes bts_qual_max is a power of two - TODO confirm */
+ bts_escape = ((unsigned long)-1 & ~bts_qual_mask) /* all remaining high bits set marks an escape record */
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
{
- struct ds_context *context;
- unsigned long buffer, adj;
- const unsigned long alignment = (1 << 3);
- int error = 0;
+ base += (ds_cfg.sizeof_field * field);
+ return *(unsigned long *)base;
+}
- if (!ds_cfg.sizeof_ds)
- return -EOPNOTSUPP;
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+{
+ base += (ds_cfg.sizeof_field * field); /* step to the requested field of the raw record */
+ (*(unsigned long *)base) = val; /* store val as an unsigned long at that offset */
+}
- /* we require some space to do alignment adjustments below */
- if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+
+/*
+ * The raw BTS data is architecture dependent.
+ *
+ * For higher-level users, we give an arch-independent view.
+ * - ds.h defines struct bts_struct
+ * - bts_read translates one raw bts record into a bts_struct
+ * - bts_write translates one bts_struct into the raw format and
+ * writes it into the top of the parameter tracer's buffer.
+ *
+ * return: bytes read/written on success; -Eerrno, otherwise
+ */
+static int bts_read(struct bts_tracer *tracer, const void *at,
+ struct bts_struct *out)
+{
+ if (!tracer)
return -EINVAL;
- /* buffer overflow notification is not yet implemented */
- if (ovfl)
- return -EOPNOTSUPP;
+ if (at < tracer->trace.ds.begin)
+ return -EINVAL;
+ if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
+ return -EINVAL;
- spin_lock(&ds_lock);
+ memset(out, 0, sizeof(*out));
+ if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
+ out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
+ out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
+ out->variant.timestamp.pid = bts_get(at, bts_pid);
+ } else {
+ out->qualifier = bts_branch;
+ out->variant.lbr.from = bts_get(at, bts_from);
+ out->variant.lbr.to = bts_get(at, bts_to);
+
+ if (!out->variant.lbr.from && !out->variant.lbr.to)
+ out->qualifier = bts_invalid;
+ }
- if (!check_tracer(task))
- return -EPERM;
+ return ds_cfg.sizeof_rec[ds_bts];
+}
- error = -ENOMEM;
- context = ds_alloc_context(task);
- if (!context)
- goto out_unlock;
+static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
+{
+ unsigned char raw[MAX_SIZEOF_BTS];
- error = -EALREADY;
- if (context->owner[qual] == current)
- goto out_unlock;
- error = -EPERM;
- if (context->owner[qual] != NULL)
- goto out_unlock;
- context->owner[qual] = current;
+ if (!tracer)
+ return -EINVAL;
- spin_unlock(&ds_lock);
+ if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
+ return -EOVERFLOW;
+ switch (in->qualifier) {
+ case bts_invalid:
+ bts_set(raw, bts_from, 0);
+ bts_set(raw, bts_to, 0);
+ bts_set(raw, bts_flags, 0);
+ break;
+ case bts_branch:
+ bts_set(raw, bts_from, in->variant.lbr.from);
+ bts_set(raw, bts_to, in->variant.lbr.to);
+ bts_set(raw, bts_flags, 0);
+ break;
+ case bts_task_arrives:
+ case bts_task_departs:
+ bts_set(raw, bts_qual, (bts_escape | in->qualifier));
+ bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
+ bts_set(raw, bts_pid, in->variant.timestamp.pid);
+ break;
+ default:
+ return -EINVAL;
+ }
- error = -ENOMEM;
- if (!base) {
- base = ds_allocate_buffer(size, &context->pages[qual]);
- if (!base)
- goto out_release;
+ return ds_write(tracer->ds.context, ds_bts, raw,
+ ds_cfg.sizeof_rec[ds_bts]);
+}
- context->buffer[qual] = base;
- }
- error = 0;
- context->callback[qual] = ovfl;
+static void ds_write_config(struct ds_context *context,
+ struct ds_trace *cfg, enum ds_qualifier qual)
+{
+ unsigned char *ds = context->ds;
+
+ ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
+ ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
+ ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
+ ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
+}
+
+static void ds_read_config(struct ds_context *context,
+ struct ds_trace *cfg, enum ds_qualifier qual)
+{
+ unsigned char *ds = context->ds;
+
+ cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
+ cfg->top = (void *)ds_get(ds, qual, ds_index);
+ cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
+ cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
+}
+
+static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
+ void *base, size_t size, size_t ith,
+ unsigned int flags) {
+ unsigned long buffer, adj;
/* adjust the buffer address and size to meet alignment
* constraints:
@@ -425,395 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
*/
buffer = (unsigned long)base;
- adj = ALIGN(buffer, alignment) - buffer;
+ adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
buffer += adj;
size -= adj;
- size /= ds_cfg.sizeof_rec[qual];
- size *= ds_cfg.sizeof_rec[qual];
-
- ds_set(context->ds, qual, ds_buffer_base, buffer);
- ds_set(context->ds, qual, ds_index, buffer);
- ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+ trace->n = size / ds_cfg.sizeof_rec[qual];
+ trace->size = ds_cfg.sizeof_rec[qual];
- if (ovfl) {
- /* todo: select a suitable interrupt threshold */
- } else
- ds_set(context->ds, qual,
- ds_interrupt_threshold, buffer + size + 1);
+ size = (trace->n * trace->size);
- /* we keep the context until ds_release */
- return error;
-
- out_release:
- context->owner[qual] = NULL;
- ds_put_context(context);
- return error;
-
- out_unlock:
- spin_unlock(&ds_lock);
- ds_put_context(context);
- return error;
-}
+ trace->begin = (void *)buffer;
+ trace->top = trace->begin;
+ trace->end = (void *)(buffer + size);
+ /* The value for 'no threshold' is -1, which will set the
+ * threshold outside of the buffer, just like we want it.
+ */
+ trace->ith = (void *)(buffer + size - ith);
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
-{
- return ds_request(task, base, size, ovfl, ds_bts);
+ trace->flags = flags;
}
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
-{
- return ds_request(task, base, size, ovfl, ds_pebs);
-}
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
+ enum ds_qualifier qual, struct task_struct *task,
+ void *base, size_t size, size_t th, unsigned int flags)
{
struct ds_context *context;
int error;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ error = -EINVAL;
+ if (!base)
goto out;
- kfree(context->buffer[qual]);
- context->buffer[qual] = NULL;
-
- current->mm->total_vm -= context->pages[qual];
- current->mm->locked_vm -= context->pages[qual];
- context->pages[qual] = 0;
- context->owner[qual] = NULL;
-
- /*
- * we put the context twice:
- * once for the ds_get_context
- * once for the corresponding ds_request
- */
- ds_put_context(context);
- out:
- ds_put_context(context);
- return error;
-}
+ /* we require some space to do alignment adjustments below */
+ error = -EINVAL;
+ if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+ goto out;
-int ds_release_bts(struct task_struct *task)
-{
- return ds_release(task, ds_bts);
-}
+ if (th != (size_t)-1) {
+ th *= ds_cfg.sizeof_rec[qual];
-int ds_release_pebs(struct task_struct *task)
-{
- return ds_release(task, ds_pebs);
-}
+ error = -EINVAL;
+ if (size <= th)
+ goto out;
+ }
-static int ds_get_index(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
-{
- struct ds_context *context;
- unsigned long base, index;
- int error;
+ tracer->buffer = base;
+ tracer->size = size;
+ error = -ENOMEM;
context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ if (!context)
goto out;
+ tracer->context = context;
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
+ ds_init_ds_trace(trace, qual, base, size, th, flags);
- error = ((index - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
+ error = 0;
out:
- ds_put_context(context);
return error;
}
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
- return ds_get_index(task, pos, ds_bts);
-}
-
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
-{
- return ds_get_index(task, pos, ds_pebs);
-}
-
-static int ds_get_end(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
-{
- struct ds_context *context;
- unsigned long base, end;
+ struct bts_tracer *tracer;
+ unsigned long irq;
int error;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ error = -EOPNOTSUPP;
+ if (!ds_cfg.ctl[dsf_bts])
goto out;
- base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
+ /* buffer overflow notification is not yet implemented */
+ error = -EOPNOTSUPP;
+ if (ovfl)
+ goto out;
- error = ((end - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
- out:
- ds_put_context(context);
- return error;
-}
+ error = -ENOMEM;
+ tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+ if (!tracer)
+ goto out;
+ tracer->ovfl = ovfl;
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
-{
- return ds_get_end(task, pos, ds_bts);
-}
+ error = ds_request(&tracer->ds, &tracer->trace.ds,
+ ds_bts, task, base, size, th, flags);
+ if (error < 0)
+ goto out_tracer;
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
-{
- return ds_get_end(task, pos, ds_pebs);
-}
-static int ds_access(struct task_struct *task, size_t index,
- const void **record, enum ds_qualifier qual)
-{
- struct ds_context *context;
- unsigned long base, idx;
- int error;
+ spin_lock_irqsave(&ds_lock, irq);
- if (!record)
- return -EINVAL;
+ error = -EPERM;
+ if (!check_tracer(task))
+ goto out_unlock;
+ get_tracer(task);
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ error = -EPERM;
+ if (tracer->ds.context->bts_master)
+ goto out_put_tracer;
+ tracer->ds.context->bts_master = tracer;
- base = ds_get(context->ds, qual, ds_buffer_base);
- idx = base + (index * ds_cfg.sizeof_rec[qual]);
+ spin_unlock_irqrestore(&ds_lock, irq);
- error = -EINVAL;
- if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
- goto out;
- *record = (const void *)idx;
- error = ds_cfg.sizeof_rec[qual];
- out:
- ds_put_context(context);
- return error;
-}
+ tracer->trace.read = bts_read;
+ tracer->trace.write = bts_write;
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
-{
- return ds_access(task, index, record, ds_bts);
-}
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_resume_bts(tracer);
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
-{
- return ds_access(task, index, record, ds_pebs);
+ return tracer;
+
+ out_put_tracer:
+ put_tracer(task);
+ out_unlock:
+ spin_unlock_irqrestore(&ds_lock, irq);
+ ds_put_context(tracer->ds.context);
+ out_tracer:
+ kfree(tracer);
+ out:
+ return ERR_PTR(error);
}
-static int ds_write(struct task_struct *task, const void *record, size_t size,
- enum ds_qualifier qual, int force)
+/*
+ * Request PEBS tracing on behalf of task.
+ *
+ * base, size: trace buffer, handed through to ds_request()
+ * ovfl: overflow callback; must be NULL, since buffer overflow
+ * notification is not implemented yet
+ * th, flags: handed through to ds_request()
+ *
+ * Returns the new tracer, or an ERR_PTR():
+ * -EOPNOTSUPP if ovfl is set,
+ * -ENOMEM if the tracer cannot be allocated,
+ * -EPERM if check_tracer() refuses the task or the context already
+ * has a pebs master,
+ * or the negative error reported by ds_request().
+ */
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
- struct ds_context *context;
+ struct pebs_tracer *tracer;
+ unsigned long irq;
int error;
- if (!record)
- return -EINVAL;
+ /* buffer overflow notification is not yet implemented */
+ error = -EOPNOTSUPP;
+ if (ovfl)
+ goto out;
- error = -EPERM;
- context = ds_get_context(task);
- if (!context)
+ error = -ENOMEM;
+ tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+ if (!tracer)
goto out;
+ tracer->ovfl = ovfl;
- if (!force) {
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
- }
+ error = ds_request(&tracer->ds, &tracer->trace.ds,
+ ds_pebs, task, base, size, th, flags);
+ if (error < 0)
+ goto out_tracer;
- error = 0;
- while (size) {
- unsigned long base, index, end, write_end, int_th;
- unsigned long write_size, adj_write_size;
+ spin_lock_irqsave(&ds_lock, irq);
- /*
- * write as much as possible without producing an
- * overflow interrupt.
- *
- * interrupt_threshold must either be
- * - bigger than absolute_maximum or
- * - point to a record between buffer_base and absolute_maximum
- *
- * index points to a valid record.
- */
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
- int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+ error = -EPERM;
+ if (!check_tracer(task))
+ goto out_unlock;
+ get_tracer(task);
- write_end = min(end, int_th);
+ /* only one pebs tracer may own a context at a time */
+ error = -EPERM;
+ if (tracer->ds.context->pebs_master)
+ goto out_put_tracer;
+ tracer->ds.context->pebs_master = tracer;
- /* if we are already beyond the interrupt threshold,
- * we fill the entire buffer */
- if (write_end <= index)
- write_end = end;
+ spin_unlock_irqrestore(&ds_lock, irq);
- if (write_end <= index)
- goto out;
+ /*
+ * Write the PEBS part of the DS area. Passing ds_bts here would
+ * overwrite the BTS configuration and leave PEBS unconfigured.
+ */
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+ ds_resume_pebs(tracer);
- write_size = min((unsigned long) size, write_end - index);
- memcpy((void *)index, record, write_size);
+ return tracer;
- record = (const char *)record + write_size;
- size -= write_size;
- error += write_size;
+ out_put_tracer:
+ put_tracer(task);
+ out_unlock:
+ spin_unlock_irqrestore(&ds_lock, irq);
+ ds_put_context(tracer->ds.context);
+ out_tracer:
+ kfree(tracer);
+ out:
+ return ERR_PTR(error);
+}
- adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
- adj_write_size *= ds_cfg.sizeof_rec[qual];
+void ds_release_bts(struct bts_tracer *tracer)
+{
+ if (!tracer)
+ return;
- /* zero out trailing bytes */
- memset((char *)index + write_size, 0,
- adj_write_size - write_size);
- index += adj_write_size;
+ ds_suspend_bts(tracer);
- if (index >= end)
- index = base;
- ds_set(context->ds, qual, ds_index, index);
+ WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
+ tracer->ds.context->bts_master = NULL;
- if (index >= int_th)
- ds_overflow(task, context, qual);
- }
+ put_tracer(tracer->ds.context->task);
+ ds_put_context(tracer->ds.context);
- out:
- ds_put_context(context);
- return error;
+ kfree(tracer);
}
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+void ds_suspend_bts(struct bts_tracer *tracer)
{
- return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+ struct task_struct *task;
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_pebs, /* force = */ 0);
-}
+ if (!tracer)
+ return;
-int ds_unchecked_write_bts(struct task_struct *task,
- const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+ task = tracer->ds.context->task;
-int ds_unchecked_write_pebs(struct task_struct *task,
- const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+ if (!task || (task == current))
+ update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+
+ if (task) {
+ task->thread.debugctlmsr &= ~BTS_CONTROL;
+
+ if (!task->thread.debugctlmsr)
+ clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+ }
}
-static int ds_reset_or_clear(struct task_struct *task,
- enum ds_qualifier qual, int clear)
+/*
+ * Resume BTS tracing for tracer; a NULL tracer is ignored.
+ *
+ * The control bits are built from ds_cfg and the tracer's flags, ORed
+ * into the saved debugctlmsr of the context's task (if it has one) and,
+ * when there is no task or the task is current, applied right away via
+ * update_debugctlmsr().
+ */
+void ds_resume_bts(struct bts_tracer *tracer)
{
- struct ds_context *context;
- unsigned long base, end;
- int error;
+ struct task_struct *task;
+ unsigned long control;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ if (!tracer)
+ return;
- base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
+ task = tracer->ds.context->task;
- if (clear)
- memset((void *)base, 0, end - base);
+ /*
+ * NOTE(review): the kernel/user bits are set when the matching
+ * BTS_KERNEL/BTS_USER flag is absent - presumably they are "off"
+ * bits that suppress tracing at that privilege level; confirm
+ * against the hardware documentation.
+ */
+ control = ds_cfg.ctl[dsf_bts];
+ if (!(tracer->trace.ds.flags & BTS_KERNEL))
+ control |= ds_cfg.ctl[dsf_bts_kernel];
+ if (!(tracer->trace.ds.flags & BTS_USER))
+ control |= ds_cfg.ctl[dsf_bts_user];
- ds_set(context->ds, qual, ds_index, base);
+ if (task) {
+ task->thread.debugctlmsr |= control;
+ set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+ }
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ if (!task || (task == current))
+ update_debugctlmsr(get_debugctlmsr() | control);
}
-int ds_reset_bts(struct task_struct *task)
+void ds_release_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+ if (!tracer)
+ return;
+
+ ds_suspend_pebs(tracer);
+
+ WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
+ tracer->ds.context->pebs_master = NULL;
+
+ put_tracer(tracer->ds.context->task);
+ ds_put_context(tracer->ds.context);
+
+ kfree(tracer);
}
-int ds_reset_pebs(struct task_struct *task)
+void ds_suspend_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+
}
-int ds_clear_bts(struct task_struct *task)
+void ds_resume_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+
}
-int ds_clear_pebs(struct task_struct *task)
+const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+ if (!tracer)
+ return NULL;
+
+ ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ return &tracer->trace;
}
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
{
- struct ds_context *context;
- int error;
+ if (!tracer)
+ return NULL;
+
+ ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+ tracer->trace.reset_value =
+ *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
- if (!value)
+ return &tracer->trace;
+}
+
+int ds_reset_bts(struct bts_tracer *tracer)
+{
+ if (!tracer)
return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ tracer->trace.ds.top = tracer->trace.ds.begin;
- *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+ ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+ (unsigned long)tracer->trace.ds.top);
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ return 0;
}
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+/*
+ * Reset the PEBS trace of tracer: rewind the tracer's top pointer to
+ * its begin pointer and set the PEBS index in the DS area to match.
+ *
+ * Returns 0 on success, -EINVAL if tracer is NULL.
+ */
+int ds_reset_pebs(struct pebs_tracer *tracer)
{
- struct ds_context *context;
- int error;
+ if (!tracer)
+ return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ tracer->trace.ds.top = tracer->trace.ds.begin;
- *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ /*
+ * Update the PEBS index. Using ds_bts here would clobber the BTS
+ * index and leave the PEBS index where it was.
+ */
+ ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
+ (unsigned long)tracer->trace.ds.top);
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ return 0;
+}
+
+/*
+ * Store value in the DS area of tracer's context, at byte offset
+ * 8 * ds_cfg.sizeof_field. This is the same slot ds_read_pebs() reads
+ * back as reset_value.
+ *
+ * Returns 0 on success, -EINVAL if tracer is NULL.
+ */
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+{
+ if (!tracer)
+ return -EINVAL;
+
+ *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+ return 0;
}
-static const struct ds_configuration ds_cfg_var = {
- .sizeof_ds = sizeof(long) * 12,
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
- .sizeof_rec[ds_pebs] = sizeof(long) * 10
+static const struct ds_configuration ds_cfg_netburst = {
+ .name = "Netburst",
+ .ctl[dsf_bts] = (1 << 2) | (1 << 3),
+ .ctl[dsf_bts_kernel] = (1 << 5),
+ .ctl[dsf_bts_user] = (1 << 6),
+
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+#ifdef __i386__
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10,
+#else
+ .sizeof_rec[ds_pebs] = sizeof(long) * 18,
+#endif
+};
+static const struct ds_configuration ds_cfg_pentium_m = {
+ .name = "Pentium M",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+#ifdef __i386__
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10,
+#else
+ .sizeof_rec[ds_pebs] = sizeof(long) * 18,
+#endif
};
-static const struct ds_configuration ds_cfg_64 = {
- .sizeof_ds = 8 * 12,
- .sizeof_field = 8,
- .sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 10
+static const struct ds_configuration ds_cfg_core2_atom = {
+ .name = "Core 2/Atom",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+ .ctl[dsf_bts_kernel] = (1 << 9),
+ .ctl[dsf_bts_user] = (1 << 10),
+
+ .sizeof_field = 8,
+ .sizeof_rec[ds_bts] = 8 * 3,
+ .sizeof_rec[ds_pebs] = 8 * 18,
};
+/*
+ * Install cfg as the active DS configuration (ds_cfg) and log its name.
+ *
+ * Without cpu_has_bts the BTS control bits are cleared from the active
+ * copy; missing BTS or PEBS support is logged. Warns once if
+ * MAX_SIZEOF_DS cannot hold 12 fields of the configured size.
+ */
-static inline void
+static void
ds_configure(const struct ds_configuration *cfg)
{
+ /* the assignment below overwrites every member again */
+ memset(&ds_cfg, 0, sizeof(ds_cfg));
ds_cfg = *cfg;
+
+ printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+
+ if (!cpu_has_bts) {
+ ds_cfg.ctl[dsf_bts] = 0;
+ printk(KERN_INFO "[ds] bts not available\n");
+ }
+ if (!cpu_has_pebs)
+ printk(KERN_INFO "[ds] pebs not available\n");
+
+ WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
}
void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -821,25 +949,27 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
- case 0xD:
- case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_var);
+ case 0x9:
+ case 0xd: /* Pentium M */
+ ds_configure(&ds_cfg_pentium_m);
break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- ds_configure(&ds_cfg_64);
+ case 0xf:
+ case 0x17: /* Core2 */
+ case 0x1c: /* Atom */
+ ds_configure(&ds_cfg_core2_atom);
break;
+ case 0x1a: /* i7 */
default:
/* sorry, don't know about them */
break;
}
break;
- case 0xF:
+ case 0xf:
switch (c->x86_model) {
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_var);
+ ds_configure(&ds_cfg_netburst);
break;
default:
/* sorry, don't know about them */
@@ -852,13 +982,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
}
}
-void ds_free(struct ds_context *context)
+/*
+ * Change the DS configuration from tracing prev to tracing next.
+ *
+ * If prev has a DS context, the debug control MSR is cleared first and,
+ * when prev's bts master asked for BTS_TIMESTAMPS, a bts_task_departs
+ * record is written to its trace. If next has a DS context, a
+ * bts_task_arrives record is written under the same condition and
+ * MSR_IA32_DS_AREA is pointed at next's DS area. Finally the debug
+ * control MSR is loaded with next's saved debugctlmsr value.
+ */
+void ds_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ struct ds_context *prev_ctx = prev->thread.ds_ctx;
+ struct ds_context *next_ctx = next->thread.ds_ctx;
+
+ if (prev_ctx) {
+ /* cleared before the departure record is written */
+ update_debugctlmsr(0);
+
+ if (prev_ctx->bts_master &&
+ (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+ struct bts_struct ts = {
+ .qualifier = bts_task_departs,
+ .variant.timestamp.jiffies = jiffies_64,
+ .variant.timestamp.pid = prev->pid
+ };
+ bts_write(prev_ctx->bts_master, &ts);
+ }
+ }
+
+ if (next_ctx) {
+ if (next_ctx->bts_master &&
+ (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+ struct bts_struct ts = {
+ .qualifier = bts_task_arrives,
+ .variant.timestamp.jiffies = jiffies_64,
+ .variant.timestamp.pid = next->pid
+ };
+ bts_write(next_ctx->bts_master, &ts);
+ }
+
+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
+ }
+
+ /* done for every switch, whether or not next has a DS context */
+ update_debugctlmsr(next->thread.debugctlmsr);
+}
+
+/*
+ * Give tsk a clean DS state: clear its TIF_DS_AREA_MSR flag and detach
+ * any DS context pointer. father is not used.
+ */
+void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+{
+ clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
+ tsk->thread.ds_ctx = NULL;
+}
+
+/*
+ * Warn if tsk still has a DS context attached. Nothing is released
+ * here.
+ */
+void ds_exit_thread(struct task_struct *tsk)
{
- /* This is called when the task owning the parameter context
- * is dying. There should not be any user of that context left
- * to disturb us, anymore. */
- unsigned long leftovers = context->count;
- while (leftovers--)
- ds_put_context(context);
+ WARN_ON(tsk->thread.ds_ctx);
}
-#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 00000000000..87d103ded1c
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#include "dumpstack.h"
+
+int panic_on_unrecovered_nmi;
+unsigned int code_bytes = 64;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static int die_counter;
+
+/*
+ * Print one trace entry on its own line: the raw address (%p) followed
+ * by its symbolic form (%pS). Entries that are not reliable get a "? "
+ * in front of the symbol.
+ */
+void printk_address(unsigned long address, int reliable)
+{
+ printk(" [<%p>] %s%pS\n", (void *) address,
+ reliable ? "" : "? ", (void *) address);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * If addr is return_to_handler, look up the return address recorded in
+ * the task's ret_stack and report that one through ops->address() as a
+ * reliable entry.
+ *
+ * graph counts how many such entries have been resolved so far in this
+ * dump; it selects the ret_stack slot (curr_ret_stack - *graph) and is
+ * incremented after each hit. Nothing is reported when the task has no
+ * ret_stack or the counter has run past curr_ret_stack.
+ */
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+ const struct stacktrace_ops *ops,
+ struct thread_info *tinfo, int *graph)
+{
+ struct task_struct *task = tinfo->task;
+ unsigned long ret_addr;
+ int index = task->curr_ret_stack;
+
+ if (addr != (unsigned long)return_to_handler)
+ return;
+
+ if (!task->ret_stack || index < *graph)
+ return;
+
+ index -= *graph;
+ ret_addr = task->ret_stack[index].ret;
+
+ ops->address(data, ret_addr, 1);
+
+ (*graph)++;
+}
+#else
+/* Without CONFIG_FUNCTION_GRAPH_TRACER there is nothing to resolve. */
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+ const struct stacktrace_ops *ops,
+ struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+/*
+ * Tell whether p may be read as a stack slot.
+ *
+ * With an explicit end, p must lie within the THREAD_SIZE bytes below
+ * end. Without one, p must lie strictly above tinfo and leave room for
+ * size bytes below tinfo + THREAD_SIZE.
+ */
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size, void *end)
+{
+ void *stack_base = tinfo;
+
+ if (end)
+ return p >= (end - THREAD_SIZE) && p < end;
+
+ return p > stack_base && p < stack_base + THREAD_SIZE - size;
+}
+
+/*
+ * Walk the stack words starting at stack for as long as
+ * valid_stack_ptr() accepts them, and report every word that is a
+ * kernel text address through ops->address().
+ *
+ * A word located directly above the current frame pointer (at
+ * bp + sizeof(long)) is reported as reliable and the walk advances to
+ * the next frame; any other text address is reported as not reliable.
+ * Each reported address is also passed to print_ftrace_graph_addr().
+ *
+ * Returns the frame pointer reached when the walk stops.
+ */
+unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + sizeof(long)) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, 0);
+ }
+ print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+ }
+ stack++;
+ }
+ return bp;
+}
+
+
+/*
+ * stacktrace_ops warning_symbol callback: print data, then msg with
+ * symbol resolved by print_symbol(), then a newline.
+ *
+ * NOTE(review): data is handed to printk() as the format string. The
+ * only caller in this file passes a log level string; confirm no
+ * caller can pass text containing '%'.
+ */
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+/* stacktrace_ops warning callback: print data followed by msg. */
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+/*
+ * stacktrace_ops stack callback: print the stack name in angle
+ * brackets, prefixed by data. Always returns 0.
+ */
+static int print_trace_stack(void *data, char *name)
+{
+ printk("%s <%s> ", (char *)data, name);
+ return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ *
+ * The NMI watchdog is touched for every entry printed. data is printed
+ * in front of the entry and is used as the printk() format string
+ * itself (NOTE(review): confirm callers only pass a log level here).
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ touch_nmi_watchdog();
+ printk(data);
+ printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+/*
+ * Print a "Call Trace:" header followed by the trace produced by
+ * dump_trace() using print_trace_ops. log_lvl prefixes the header and
+ * is passed on as the callbacks' data argument.
+ */
+void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+ printk("%sCall Trace:\n", log_lvl);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+/* show_trace_log_lvl() with an empty log level prefix. */
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+/*
+ * show_stack_log_lvl() for task starting at sp, with no registers, a
+ * zero frame pointer and an empty log level prefix.
+ */
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ *
+ * Prints a one-line summary of the current task (pid, comm, taint
+ * flags, kernel release and the first word of the version string) and
+ * then the call trace, starting from the address of a local variable.
+ */
+void dump_stack(void)
+{
+ unsigned long bp = 0;
+ unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+ /* bp was just set to 0, so this test always passes */
+ if (!bp)
+ get_bp(bp);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+/*
+ * Enter the oops path: disable interrupts and take die_lock.
+ *
+ * If the lock is already held and this cpu is recorded as die_owner,
+ * this is a nested oops and execution continues without taking the
+ * lock again; otherwise the cpu spins until the lock is free. The nest
+ * count is incremented and this cpu recorded as owner in either case.
+ *
+ * Returns the saved interrupt flags, to be handed to oops_end().
+ */
+unsigned __kprobes long oops_begin(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!__raw_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ __raw_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ return flags;
+}
+
+/*
+ * Leave the oops path entered with oops_begin().
+ *
+ * flags: interrupt flags returned by oops_begin()
+ * regs: registers at the time of the oops; when set and
+ * kexec_should_crash() agrees, crash_kexec() is called first
+ * signr: signal to exit the current task with; 0 means return to the
+ * caller instead
+ *
+ * die_lock is released only when the nest count drops back to zero.
+ * With a non-zero signr this panics when in interrupt context or when
+ * panic_on_oops is set, and otherwise ends in do_exit(signr).
+ */
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ if (regs && kexec_should_crash(current))
+ crash_kexec(regs);
+
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE);
+ die_nest_count--;
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ oops_exit();
+
+ if (!signr)
+ return;
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+ do_exit(signr);
+}
+
+/*
+ * Print the oops report for str/err: a header line with the low 16
+ * bits of err, the oops counter and the PREEMPT/SMP/DEBUG_PAGEALLOC
+ * tags, then the registers and a short ip/sp summary.
+ *
+ * Returns 1 if a DIE_OOPS notifier answered NOTIFY_STOP (nothing past
+ * the header is printed in that case), 0 otherwise.
+ */
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+#ifdef CONFIG_X86_32
+ unsigned short ss;
+ unsigned long sp;
+#endif
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ show_registers(regs);
+#ifdef CONFIG_X86_32
+ /* defaults for a kernel-mode oops; replaced below for user mode */
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+#else
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->ip, 1);
+ printk(" RSP <%016lx>\n", regs->sp);
+#endif
+ return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ *
+ * report_bug() is consulted only when regs are not from user mode. The
+ * task is ended with SIGSEGV through oops_end(), unless __die() returns
+ * non-zero, in which case signal 0 is passed and oops_end() returns.
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
+
+ if (!user_mode_vm(regs))
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+
+/*
+ * Report a fatal NMI condition described by str.
+ *
+ * Returns early if a DIE_NMIWATCHDOG notifier answers NOTIFY_STOP.
+ * Otherwise the message and registers are printed inside an
+ * oops_begin()/oops_end() pair; then the kernel panics if do_panic or
+ * panic_on_oops is set, or else the current task is ended with SIGBUS.
+ */
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ flags = oops_begin();
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ /* signr 0: oops_end() returns here instead of exiting the task */
+ oops_end(flags, regs, 0);
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
+ nmi_exit();
+ local_irq_enable();
+ do_exit(SIGBUS);
+}
+
+/*
+ * Handle the "oops" early parameter: the value "panic" turns on
+ * panic_on_oops, any other value is accepted and ignored.
+ *
+ * Returns 0, or -EINVAL when no value was given.
+ */
+static int __init oops_setup(char *s)
+{
+ if (s == NULL)
+ return -EINVAL;
+
+ if (strcmp(s, "panic") == 0)
+ panic_on_oops = 1;
+
+ return 0;
+}
+early_param("oops", oops_setup);
+
+/*
+ * Handle the "kstack" early parameter: parse the value with
+ * simple_strtoul() (base auto-detected) into kstack_depth_to_print.
+ *
+ * Returns 0, or -EINVAL when no value was given.
+ */
+static int __init kstack_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+ return 0;
+}
+early_param("kstack", kstack_setup);
+
+/*
+ * Handle the "code_bytes=" boot option: parse the value with
+ * simple_strtoul() (base auto-detected) and store it in code_bytes,
+ * capped at 8192. Always returns 1.
+ */
+static int __init code_bytes_setup(char *s)
+{
+ /* narrowed to unsigned int before the cap is applied */
+ unsigned int requested = simple_strtoul(s, NULL, 0);
+
+ code_bytes = (requested > 8192) ? 8192 : requested;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 00000000000..da87590b869
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197..d593cd1f58d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
#include <asm/stacktrace.h>
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
+#include "dumpstack.h"
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
+ int graph = 0;
+
if (!task)
task = current;
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops, data, NULL);
+ bp = print_context_stack(context, stack, bp, ops,
+ data, NULL, &graph);
stack = (unsigned long *)context->previous_esp;
if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
}
EXPORT_SYMBOL(dump_trace);
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- unsigned long flags;
-
- oops_enter();
-
- if (die_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_local_irq_save(flags);
- __raw_spin_lock(&die_lock);
- die_owner = smp_processor_id();
- die_nest_count = 0;
- bust_spinlocks(1);
- } else {
- raw_local_irq_save(flags);
- }
- die_nest_count++;
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- bust_spinlocks(0);
- die_owner = -1;
- add_taint(TAINT_DIE);
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
-
- if (!regs)
- return;
-
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (in_interrupt())
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned short ss;
- unsigned long sp;
-
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- sysfs_printk_last_file();
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
- return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (die_nest_count < 3) {
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- } else {
- printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
- }
-
- oops_end(flags, regs, SIGSEGV);
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- spin_lock(&nmi_print_lock);
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out:
- */
- bust_spinlocks(1);
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (do_panic)
- panic("Non maskable interrupt");
- console_silent();
- spin_unlock(&nmi_print_lock);
-
- /*
- * If we are in kernel we are probably nested up pretty bad
- * and might aswell get out now while we still can:
- */
- if (!user_mode_vm(regs)) {
- current->thread.trap_no = 2;
- crash_kexec(regs);
- }
-
- bust_spinlocks(0);
- do_exit(SIGSEGV);
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a..d35db5993fd 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
#include <asm/stacktrace.h>
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
+#include "dumpstack.h"
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
unsigned *usedp, char **idp)
@@ -113,59 +101,16 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irq_stack_end =
+ (unsigned long *)per_cpu(irq_stack_ptr, cpu);
unsigned used = 0;
struct thread_info *tinfo;
+ int graph = 0;
if (!task)
task = current;
@@ -206,7 +151,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
break;
bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end);
+ data, estack_end, &graph);
ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
@@ -216,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
stack = (unsigned long *) estack_end[-2];
continue;
}
- if (irqstack_end) {
- unsigned long *irqstack;
- irqstack = irqstack_end -
- (IRQSTACKSIZE - 64) / sizeof(*irqstack);
+ if (irq_stack_end) {
+ unsigned long *irq_stack;
+ irq_stack = irq_stack_end -
+ (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
- if (stack >= irqstack && stack < irqstack_end) {
+ if (stack >= irq_stack && stack < irq_stack_end) {
if (ops->stack(data, "IRQ") < 0)
break;
bp = print_context_stack(tinfo, stack, bp,
- ops, data, irqstack_end);
+ ops, data, irq_stack_end, &graph);
/*
* We link to the next stack (which would be
* the process stack normally) the last
* pointer (index -1 to end) in the IRQ stack:
*/
- stack = (unsigned long *) (irqstack_end[-1]);
- irqstack_end = NULL;
+ stack = (unsigned long *) (irq_stack_end[-1]);
+ irq_stack_end = NULL;
ops->stack(data, "EOI");
continue;
}
@@ -243,72 +188,22 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
/*
* This handles the process stack:
*/
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
put_cpu();
}
EXPORT_SYMBOL(dump_trace);
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irq_stack_end =
+ (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+ unsigned long *irq_stack =
+ (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
/*
* debugging aid: "show_stack(NULL, NULL);" prints the
@@ -324,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
- if (stack >= irqstack && stack <= irqstack_end) {
- if (stack == irqstack_end) {
- stack = (unsigned long *) (irqstack_end[-1]);
+ if (stack >= irq_stack && stack <= irq_stack_end) {
+ if (stack == irq_stack_end) {
+ stack = (unsigned long *) (irq_stack_end[-1]);
printk(" <EOI> ");
}
} else {
@@ -342,39 +237,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-EXPORT_SYMBOL(dump_stack);
-
void show_registers(struct pt_regs *regs)
{
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = current;
sp = regs->sp;
printk("CPU %d ", cpu);
@@ -429,147 +297,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- int cpu;
- unsigned long flags;
-
- oops_enter();
-
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- __raw_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
- console_verbose();
- bust_spinlocks(1);
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- die_owner = -1;
- bust_spinlocks(0);
- die_nest_count--;
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
- if (!regs) {
- oops_exit();
- return;
- }
- if (in_interrupt())
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- sysfs_printk_last_file();
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- add_taint(TAINT_DIE);
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
- printk(" RSP <%016lx>\n", regs->sp);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- unsigned long flags;
-
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- flags = oops_begin();
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
- */
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (do_panic || panic_on_oops)
- panic("Non maskable interrupt");
- oops_end(flags, NULL, SIGBUS);
- nmi_exit();
- local_irq_enable();
- do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7aafeb5263e..e85826829cf 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -665,6 +665,27 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
}
#endif
+#ifdef CONFIG_HIBERNATION
+/**
+ * Mark ACPI NVS memory region, so that we can save/restore it during
+ * hibernation and the subsequent resume.
+ */
+static int __init e820_mark_nvs_memory(void)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->type == E820_NVS)
+ hibernate_nvs_register(ei->addr, ei->size);
+ }
+
+ return 0;
+}
+core_initcall(e820_mark_nvs_memory);
+#endif
+
/*
* Early reserved memory areas.
*/
@@ -677,22 +698,6 @@ struct early_res {
};
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
-#endif
{}
};
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 3ce029ffaa5..76b8cd953de 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -188,20 +189,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
-#ifdef CONFIG_DMAR
-static void __init intel_g33_dmar(int num, int slot, int func)
-{
- struct acpi_table_header *dmar_tbl;
- acpi_status status;
-
- status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
- if (ACPI_SUCCESS(status)) {
- printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
- dmar_disabled = 1;
- }
-}
-#endif
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -214,6 +201,12 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
+/*
+ * Only works for devices on the root bus. If you add any devices
+ * not on bus 0, re-add another loop level in early_quirks(). But
+ * be careful because at least the Nvidia quirk here relies on
+ * only matching on bus 0.
+ */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -225,10 +218,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
-#ifdef CONFIG_DMAR
- { PCI_VENDOR_ID_INTEL, 0x29c0,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
-#endif
{}
};
@@ -284,17 +273,17 @@ static int __init check_dev_quirk(int num, int slot, int func)
void __init early_quirks(void)
{
- int num, slot, func;
+ int slot, func;
if (!early_pci_allowed())
return;
/* Poor man's PCI discovery */
- for (num = 0; num < 32; num++)
- for (slot = 0; slot < 32; slot++)
- for (func = 0; func < 8; func++) {
- /* Only probe function 0 on single fn devices */
- if (check_dev_quirk(num, slot, func))
- break;
- }
+ /* Only scan the root bus */
+ for (slot = 0; slot < 32; slot++)
+ for (func = 0; func < 8; func++) {
+ /* Only probe function 0 on single fn devices */
+ if (check_dev_quirk(0, slot, func))
+ break;
+ }
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 34ad997d383..639ad98238a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -13,8 +13,8 @@
#include <asm/setup.h>
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
-#include <asm/pgtable.h>
#include <asm/fixmap.h>
+#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
/* Simple VGA output */
@@ -875,49 +875,6 @@ static struct console early_dbgp_console = {
};
#endif
-/* Console interface to a host file on AMD's SimNow! */
-
-static int simnow_fd;
-
-enum {
- MAGIC1 = 0xBACCD00A,
- MAGIC2 = 0xCA110000,
- XOPEN = 5,
- XWRITE = 4,
-};
-
-static noinline long simnow(long cmd, long a, long b, long c)
-{
- long ret;
-
- asm volatile("cpuid" :
- "=a" (ret) :
- "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
- return ret;
-}
-
-static void __init simnow_init(char *str)
-{
- char *fn = "klog";
-
- if (*str == '=')
- fn = ++str;
- /* error ignored */
- simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
-}
-
-static void simnow_write(struct console *con, const char *s, unsigned n)
-{
- simnow(XWRITE, simnow_fd, (unsigned long)s, n);
-}
-
-static struct console simnow_console = {
- .name = "simnow",
- .write = simnow_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
static int __initdata early_console_initialized;
@@ -929,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
va_list ap;
va_start(ap, fmt);
- n = vscnprintf(buf, 512, fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
early_console->write(early_console, buf, n);
va_end(ap);
}
@@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf)
max_ypos = boot_params.screen_info.orig_video_lines;
current_ypos = boot_params.screen_info.orig_y;
early_console = &early_vga_console;
- } else if (!strncmp(buf, "simnow", 6)) {
- simnow_init(buf + 6);
- early_console = &simnow_console;
- keep_early = 1;
#ifdef CONFIG_EARLY_PRINTK_DBGP
} else if (!strncmp(buf, "dbgp", 4)) {
if (early_dbgp_init(buf+4) < 0)
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe1..b205272ad39 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
SMBIOS_TABLE_GUID)) {
efi.smbios = config_tables[i].table;
printk(" SMBIOS=0x%lx ", config_tables[i].table);
+#ifdef CONFIG_X86_UV
} else if (!efi_guidcmp(config_tables[i].guid,
UV_SYSTEM_TABLE_GUID)) {
efi.uv_systab = config_tables[i].table;
printk(" UVsystab=0x%lx ", config_tables[i].table);
+#endif
} else if (!efi_guidcmp(config_tables[i].guid,
HCDP_TABLE_GUID)) {
efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215..a4ee29127fd 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
#include <asm/proto.h>
#include <asm/efi.h>
#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
static pgd_t save_pgd __initdata;
static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca..e9920683145 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - orig_eax
- * 2C(%esp) - %eip
- * 30(%esp) - %cs
- * 34(%esp) - %eflags
- * 38(%esp) - %oldesp
- * 3C(%esp) - %oldss
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -101,121 +102,221 @@
#define resume_userspace_sig resume_userspace
#endif
-#define SAVE_ALL \
- cld; \
- pushl %fs; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET fs, 0;*/\
- pushl %es; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET es, 0;*/\
- pushl %ds; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET ds, 0;*/\
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET eax, 0;\
- pushl %ebp; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebp, 0;\
- pushl %edi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edi, 0;\
- pushl %esi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET esi, 0;\
- pushl %edx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edx, 0;\
- pushl %ecx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ecx, 0;\
- pushl %ebx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebx, 0;\
- movl $(__USER_DS), %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- movl $(__KERNEL_PERCPU), %edx; \
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS; the kernel only uses it for the stack
+ * canary, which gcc requires to be at %gs:20. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+ CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE gs*/
+ .if \pop <> 0
+ add $\pop, %esp
+ CFI_ADJUST_CFA_OFFSET -\pop
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+ /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0;*/
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0;*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0;*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
-#define RESTORE_INT_REGS \
- popl %ebx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebx;\
- popl %ecx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ecx;\
- popl %edx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edx;\
- popl %esi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE esi;\
- popl %edi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edi;\
- popl %ebp; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebp;\
- popl %eax; \
- CFI_ADJUST_CFA_OFFSET -4;\
+.macro RESTORE_INT_REGS
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
CFI_RESTORE eax
+.endm
-#define RESTORE_REGS \
- RESTORE_INT_REGS; \
-1: popl %ds; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE ds;*/\
-2: popl %es; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE es;*/\
-3: popl %fs; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE fs;*/\
-.pushsection .fixup,"ax"; \
-4: movl $0,(%esp); \
- jmp 1b; \
-5: movl $0,(%esp); \
- jmp 2b; \
-6: movl $0,(%esp); \
- jmp 3b; \
-.section __ex_table,"a";\
- .align 4; \
- .long 1b,4b; \
- .long 2b,5b; \
- .long 3b,6b; \
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE ds;*/
+2: popl %es
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE es;*/
+3: popl %fs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE fs;*/
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.section __ex_table, "a"
+ .align 4
+ .long 1b, 4b
+ .long 2b, 5b
+ .long 3b, 6b
.popsection
+ POP_GS_EX
+.endm
-#define RING0_INT_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 3*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_INT_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 3*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_EC_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 4*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_EC_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 4*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_PTREGS_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
- CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
- CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
- CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
- CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
- CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+.macro RING0_PTREGS_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+.endm
ENTRY(ret_from_fork)
CFI_STARTPROC
@@ -362,6 +463,7 @@ sysenter_exit:
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
@@ -410,6 +512,7 @@ sysexit_audit:
.align 4
.long 1b,2b
.popsection
+ PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
# system call handler stub
@@ -452,8 +555,7 @@ restore_all:
restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
- RESTORE_REGS
- addl $4, %esp # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
@@ -595,52 +697,83 @@ syscall_badsys:
END(syscall_badsys)
CFI_ENDPROC
-#define FIXUP_ESPFIX_STACK \
- /* since we are on a wrong stack, we cant make it a C code :( */ \
- PER_CPU(gdt_page, %ebx); \
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
- addl %esp, %eax; \
- pushl $__KERNEL_DS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4; \
- lss (%esp), %esp; \
- CFI_ADJUST_CFA_OFFSET -8;
-#define UNWIND_ESPFIX_STACK \
- movl %ss, %eax; \
- /* see if on espfix stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 27f; \
- movl $__KERNEL_DS, %eax; \
- movl %eax, %ds; \
- movl %eax, %es; \
- /* switch to normal stack */ \
- FIXUP_ESPFIX_STACK; \
-27:;
+/*
+ * System calls that need a pt_regs pointer.
+ */
+#define PTREGSCALL(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ jmp sys_##name;
+
+PTREGSCALL(iopl)
+PTREGSCALL(fork)
+PTREGSCALL(clone)
+PTREGSCALL(vfork)
+PTREGSCALL(execve)
+PTREGSCALL(sigaltstack)
+PTREGSCALL(sigreturn)
+PTREGSCALL(rt_sigreturn)
+PTREGSCALL(vm86)
+PTREGSCALL(vm86old)
+
+.macro FIXUP_ESPFIX_STACK
+ /* since we are on the wrong stack, we can't make this C code :( */
+ PER_CPU(gdt_page, %ebx)
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ addl %esp, %eax
+ pushl $__KERNEL_DS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+.endm
/*
- * Build the entry stubs and pointer table with
- * some assembler magic.
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
*/
-.section .rodata,"a"
+.section .init.rodata,"a"
ENTRY(interrupt)
.text
-
+ .p2align 5
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
RING0_INT_FRAME
-vector=0
-.rept NR_VECTORS
- ALIGN
- .if vector
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ .balign 32
+ .rept 7
+ .if vector < NR_VECTORS
+ .if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl $~(vector)
+ .endif
+1: pushl $(~vector+0x80) /* Note: always in signed byte range */
CFI_ADJUST_CFA_OFFSET 4
- jmp common_interrupt
- .previous
+ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ jmp 2f
+ .endif
+ .previous
.long 1b
- .text
+ .text
vector=vector+1
+ .endif
+ .endr
+2: jmp common_interrupt
.endr
END(irq_entries_start)
@@ -652,8 +785,9 @@ END(interrupt)
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
*/
- ALIGN
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
+ addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
SAVE_ALL
TRACE_IRQS_OFF
movl %esp,%eax
@@ -662,7 +796,7 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-#define BUILD_INTERRUPT(name, nr) \
+#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
pushl $~(nr); \
@@ -670,72 +804,15 @@ ENTRY(name) \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
- call smp_##name; \
+ call fn; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
-/* The include is where all of the SMP etc. interrupts come from */
-#include "entry_arch.h"
+#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
-KPROBE_ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
- ALIGN
-error_code:
- /* the function address is in %fs's slot on the stack */
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(page_fault)
+/* The include is where all of the SMP etc. interrupts come from */
+#include <asm/entry_arch.h>
ENTRY(coprocessor_error)
RING0_INT_FRAME
@@ -767,140 +844,6 @@ ENTRY(device_not_available)
CFI_ENDPROC
END(device_not_available)
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
- CFI_REL_OFFSET eip, 0
-
-KPROBE_ENTRY(debug)
- RING0_INT_FRAME
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
-debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-KPROBE_ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- je nmi_espfix_stack
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_nocheck_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- .endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
- CFI_ENDPROC
-KPROBE_END(nmi)
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
@@ -916,19 +859,6 @@ ENTRY(native_irq_enable_sysexit)
END(native_irq_enable_sysexit)
#endif
-KPROBE_ENTRY(int3)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(int3)
-
ENTRY(overflow)
RING0_INT_FRAME
pushl $0
@@ -993,14 +923,6 @@ ENTRY(stack_segment)
CFI_ENDPROC
END(stack_segment)
-KPROBE_ENTRY(general_protection)
- RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-KPROBE_END(general_protection)
-
ENTRY(alignment_check)
RING0_EC_FRAME
pushl $do_alignment_check
@@ -1051,6 +973,7 @@ ENTRY(kernel_thread_helper)
push %eax
CFI_ADJUST_CFA_OFFSET 4
call do_exit
+ ud2 # padding for call trace
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
@@ -1157,6 +1080,9 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
pushl %eax
pushl %ecx
pushl %edx
@@ -1171,6 +1097,11 @@ ftrace_call:
popl %edx
popl %ecx
popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -1180,8 +1111,18 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
.globl ftrace_stub
ftrace_stub:
ret
@@ -1200,13 +1141,265 @@ trace:
popl %edx
popl %ecx
popl %eax
-
jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %edx
+ lea 0x4(%ebp), %eax
+ subl $MCOUNT_INSN_SIZE, %edx
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl $0
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ call ftrace_return_to_handler
+ movl %eax, 0xc(%esp)
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+#endif
+
.section .rodata,"a"
#include "syscall_table_32.S"
syscall_table_size=(.-sys_call_table)
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ ALIGN
+error_code:
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ cld
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ TRACE_IRQS_OFF
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ CFI_DEF_CFA esp, 0
+ CFI_UNDEFINED eip
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $__KERNEL_CS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eip, 0
+.endm
+
+ENTRY(debug)
+ RING0_INT_FRAME
+ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
+debug_stack_correct:
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK 12, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK 24, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addw $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+ jmp irq_return
+ CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(general_protection)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a..fbcf96b295f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
*
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
- *
- * Normal syscalls and interrupts don't save a full stack frame, this is
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
* only done for syscall tracing, signals or fork/exec et.al.
- *
- * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
- * - full stack frame: Like partial stack frame, but all register saved.
+ * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
@@ -52,6 +52,7 @@
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>
+#include <asm/percpu.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -60,7 +61,6 @@
#define __AUDIT_ARCH_LE 0x40000000
.code64
-
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
@@ -68,16 +68,10 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
- /* taken from glibc */
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
+ MCOUNT_SAVE_FRAME
movq 0x38(%rsp), %rdi
movq 8(%rbp), %rsi
@@ -87,14 +81,13 @@ ENTRY(ftrace_caller)
ftrace_call:
call ftrace_stub
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
+ MCOUNT_RESTORE_FRAME
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -103,15 +96,63 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
.globl ftrace_stub
ftrace_stub:
retq
trace:
- /* taken from glibc */
- subq $0x38, %rsp
+ MCOUNT_SAVE_FRAME
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ MCOUNT_SAVE_FRAME
+
+ leaq 8(%rbp), %rdi
+ movq 0x38(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+ subq $80, %rsp
+
movq %rax, (%rsp)
movq %rcx, 8(%rsp)
movq %rdx, 16(%rsp)
@@ -119,13 +160,14 @@ trace:
movq %rdi, 32(%rsp)
movq %r8, 40(%rsp)
movq %r9, 48(%rsp)
+ movq %r10, 56(%rsp)
+ movq %r11, 64(%rsp)
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
+ call ftrace_return_to_handler
+ movq %rax, 72(%rsp)
+ movq 64(%rsp), %r11
+ movq 56(%rsp), %r10
movq 48(%rsp), %r9
movq 40(%rsp), %r8
movq 32(%rsp), %rdi
@@ -133,16 +175,14 @@ trace:
movq 16(%rsp), %rdx
movq 8(%rsp), %rcx
movq (%rsp), %rax
- addq $0x38, %rsp
+ addq $72, %rsp
+ retq
+#endif
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
-#endif
+#endif
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
@@ -161,29 +201,29 @@ ENTRY(native_usergs_sysret64)
.endm
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
* RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
* manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp
- movq %gs:pda_oldrsp,\tmp
- movq \tmp,RSP(%rsp)
- movq $__USER_DS,SS(%rsp)
- movq $__USER_CS,CS(%rsp)
- movq $-1,RCX(%rsp)
- movq R11(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS(%rsp)
+ */
+
+ /* %rsp:at FRAMEEND */
+ .macro FIXUP_TOP_OF_STACK tmp offset=0
+ movq PER_CPU_VAR(old_rsp),\tmp
+ movq \tmp,RSP+\offset(%rsp)
+ movq $__USER_DS,SS+\offset(%rsp)
+ movq $__USER_CS,CS+\offset(%rsp)
+ movq $-1,RCX+\offset(%rsp)
+ movq R11+\offset(%rsp),\tmp /* get eflags */
+ movq \tmp,EFLAGS+\offset(%rsp)
.endm
- .macro RESTORE_TOP_OF_STACK tmp,offset=0
- movq RSP-\offset(%rsp),\tmp
- movq \tmp,%gs:pda_oldrsp
- movq EFLAGS-\offset(%rsp),\tmp
- movq \tmp,R11-\offset(%rsp)
+ .macro RESTORE_TOP_OF_STACK tmp offset=0
+ movq RSP+\offset(%rsp),\tmp
+ movq \tmp,PER_CPU_VAR(old_rsp)
+ movq EFLAGS+\offset(%rsp),\tmp
+ movq \tmp,R11+\offset(%rsp)
.endm
.macro FAKE_STACK_FRAME child_rip
@@ -195,7 +235,7 @@ ENTRY(native_usergs_sysret64)
pushq %rax /* rsp */
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rsp,0
- pushq $(1<<9) /* eflags - interrupts on */
+ pushq $X86_EFLAGS_IF /* eflags - interrupts on */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET rflags,0*/
pushq $__KERNEL_CS /* cs */
@@ -213,62 +253,187 @@ ENTRY(native_usergs_sysret64)
CFI_ADJUST_CFA_OFFSET -(6*8)
.endm
- .macro CFI_DEFAULT_STACK start=1
+/*
+ * empty initial frame state: only defines the CFA, no saved registers recorded
+ */
+ .macro EMPTY_FRAME start=1 offset=0
.if \start
- CFI_STARTPROC simple
+ CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8
+ CFI_DEF_CFA rsp,8+\offset
.else
- CFI_DEF_CFA_OFFSET SS+8
+ CFI_DEF_CFA_OFFSET 8+\offset
.endif
- CFI_REL_OFFSET r15,R15
- CFI_REL_OFFSET r14,R14
- CFI_REL_OFFSET r13,R13
- CFI_REL_OFFSET r12,R12
- CFI_REL_OFFSET rbp,RBP
- CFI_REL_OFFSET rbx,RBX
- CFI_REL_OFFSET r11,R11
- CFI_REL_OFFSET r10,R10
- CFI_REL_OFFSET r9,R9
- CFI_REL_OFFSET r8,R8
- CFI_REL_OFFSET rax,RAX
- CFI_REL_OFFSET rcx,RCX
- CFI_REL_OFFSET rdx,RDX
- CFI_REL_OFFSET rsi,RSI
- CFI_REL_OFFSET rdi,RDI
- CFI_REL_OFFSET rip,RIP
- /*CFI_REL_OFFSET cs,CS*/
- /*CFI_REL_OFFSET rflags,EFLAGS*/
- CFI_REL_OFFSET rsp,RSP
- /*CFI_REL_OFFSET ss,SS*/
.endm
+
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro INTR_FRAME start=1 offset=0
+ EMPTY_FRAME \start, SS+8+\offset-RIP
+ /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
+ CFI_REL_OFFSET rsp, RSP+\offset-RIP
+ /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
+ /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
+ CFI_REL_OFFSET rip, RIP+\offset-RIP
+ .endm
+
+/*
+ * initial frame state for exceptions with error code (and interrupts
+ * with vector already pushed)
+ */
+ .macro XCPT_FRAME start=1 offset=0
+ INTR_FRAME \start, RIP+\offset-ORIG_RAX
+ /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
+ .endm
+
+/*
+ * frame that enables calling into C.
+ */
+ .macro PARTIAL_FRAME start=1 offset=0
+ XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
+ CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
+ CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
+ CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
+ CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
+ CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
+ CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ .endm
+
/*
- * A newly forked process directly context switches into this.
- */
-/* rdi: prev */
+ * frame that enables passing a complete pt_regs to a C function.
+ */
+ .macro DEFAULT_FRAME start=1 offset=0
+ PARTIAL_FRAME \start, R11+\offset-R15
+ CFI_REL_OFFSET rbx, RBX+\offset
+ CFI_REL_OFFSET rbp, RBP+\offset
+ CFI_REL_OFFSET r12, R12+\offset
+ CFI_REL_OFFSET r13, R13+\offset
+ CFI_REL_OFFSET r14, R14+\offset
+ CFI_REL_OFFSET r15, R15+\offset
+ .endm
+
+/* save partial stack frame */
+ENTRY(save_args)
+ XCPT_FRAME
+ cld
+ movq_cfi rdi, RDI+16-ARGOFFSET
+ movq_cfi rsi, RSI+16-ARGOFFSET
+ movq_cfi rdx, RDX+16-ARGOFFSET
+ movq_cfi rcx, RCX+16-ARGOFFSET
+ movq_cfi rax, RAX+16-ARGOFFSET
+ movq_cfi r8, R8+16-ARGOFFSET
+ movq_cfi r9, R9+16-ARGOFFSET
+ movq_cfi r10, R10+16-ARGOFFSET
+ movq_cfi r11, R11+16-ARGOFFSET
+
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq_cfi rbp, 8 /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %rbp */
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate per-cpu counter (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl PER_CPU_VAR(irq_count)
+ jne 2f
+ popq_cfi %rax /* move return address... */
+ mov PER_CPU_VAR(irq_stack_ptr),%rsp
+ EMPTY_FRAME 0
+ pushq_cfi %rbp /* backlink for unwinder */
+ pushq_cfi %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+
+ENTRY(save_rest)
+ PARTIAL_FRAME 1 REST_SKIP+8
+ movq 5*8+16(%rsp), %r11 /* save return address */
+ movq_cfi rbx, RBX+16
+ movq_cfi rbp, RBP+16
+ movq_cfi r12, R12+16
+ movq_cfi r13, R13+16
+ movq_cfi r14, R14+16
+ movq_cfi r15, R15+16
+ movq %r11, 8(%rsp) /* return address */
+ FIXUP_TOP_OF_STACK %r11, 16
+ ret
+ CFI_ENDPROC
+END(save_rest)
+
+/* save complete stack frame */
+ENTRY(save_paranoid)
+ XCPT_FRAME 1 RDI+8
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(save_paranoid)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
ENTRY(ret_from_fork)
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
+
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
+ popf # reset kernel eflags
CFI_ADJUST_CFA_OFFSET -8
- call schedule_tail
+
+ call schedule_tail # rdi: 'prev' task parameter
+
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
- jnz rff_trace
-rff_action:
+
+ CFI_REMEMBER_STATE
RESTORE_REST
- testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
+
+ testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
- testl $_TIF_IA32,TI_flags(%rcx)
+
+ testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
- RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
- jmp ret_from_sys_call
-rff_trace:
- movq %rsp,%rdi
- call syscall_trace_leave
- GET_THREAD_INFO(%rcx)
- jmp rff_action
+
+ RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+ jmp ret_from_sys_call # go to the SYSRET fastpath
+
+ CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -278,20 +443,20 @@ END(ret_from_fork)
* SYSCALL does not save anything on the stack and does not change the
* stack pointer.
*/
-
+
/*
- * Register setup:
+ * Register setup:
* rax system call number
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
+ * rcx return address for syscall/sysret, C arg3
* rsi arg1
- * rdx arg2
+ * rdx arg2
* r10 arg3 (--> moved to rcx for C)
* r8 arg4
* r9 arg5
* r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
- *
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
* Interrupts are off on entry.
* Only called from user space.
*
@@ -301,12 +466,12 @@ END(ret_from_fork)
* When user can change the frames always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
- */
+ */
ENTRY(system_call)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,PDA_STACKOFFSET
+ CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
@@ -317,15 +482,15 @@ ENTRY(system_call)
*/
ENTRY(system_call_after_swapgs)
- movq %rsp,%gs:pda_oldrsp
- movq %gs:pda_kernelstack,%rsp
+ movq %rsp,PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack),%rsp
/*
* No need to follow this irqs off/on section - it's straight
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,1
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
@@ -339,19 +504,19 @@ system_call_fastpath:
movq %rax,RAX-ARGOFFSET(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
- */
+ * Has incomplete stack frame and undefined top of stack.
+ */
ret_from_sys_call:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: flagmask */
-sysret_check:
+sysret_check:
LOCKDEP_SYS_EXIT
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl TI_flags(%rcx),%edx
andl %edi,%edx
- jnz sysret_careful
+ jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -361,12 +526,12 @@ sysret_check:
CFI_REGISTER rip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER rflags,r11*/
- movq %gs:pda_oldrsp, %rsp
+ movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
/* Handle reschedules */
- /* edx: work, edi: workmask */
+ /* edx: work, edi: workmask */
sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
@@ -379,7 +544,7 @@ sysret_careful:
CFI_ADJUST_CFA_OFFSET -8
jmp sysret_check
- /* Handle a signal */
+ /* Handle a signal */
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -388,17 +553,20 @@ sysret_signal:
jc sysret_audit
#endif
/* edx: work flags (arg3) */
- leaq do_notify_resume(%rip),%rax
leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
xorl %esi,%esi # oldset -> arg2
- call ptregscall_common
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call do_notify_resume
+ RESTORE_TOP_OF_STACK %r11
+ RESTORE_REST
movl $_TIF_WORK_MASK,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
-
+
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
@@ -437,7 +605,7 @@ sysret_audit:
#endif /* CONFIG_AUDITSYSCALL */
/* Do syscall tracing */
-tracesys:
+tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jz auditsys
@@ -460,8 +628,8 @@ tracesys:
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */
-
-/*
+
+/*
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
@@ -505,18 +673,18 @@ int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- /* Check for syscall exit trace */
+ /* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
- leaq 8(%rsp),%rdi # &ptregs -> arg1
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
-
+
int_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
@@ -531,22 +699,24 @@ int_restore_rest:
jmp int_with_check
CFI_ENDPROC
END(system_call)
-
-/*
+
+/*
* Certain special system calls that need to save a complete full stack frame.
- */
-
+ */
.macro PTREGSCALL label,func,arg
- .globl \label
-\label:
- leaq \func(%rip),%rax
- leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- jmp ptregscall_common
+ENTRY(\label)
+ PARTIAL_FRAME 1 8 /* offset 8: return address */
+ subq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ call save_rest
+ DEFAULT_FRAME 0 8 /* offset 8: return address */
+ leaq 8(%rsp), \arg /* pt_regs pointer */
+ call \func
+ jmp ptregscall_common
+ CFI_ENDPROC
END(\label)
.endm
- CFI_STARTPROC
-
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -554,25 +724,18 @@ END(\label)
PTREGSCALL stub_iopl, sys_iopl, %rsi
ENTRY(ptregscall_common)
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
- SAVE_REST
- movq %r11, %r15
- CFI_REGISTER rip, r15
- FIXUP_TOP_OF_STACK %r11
- call *%rax
- RESTORE_TOP_OF_STACK %r11
- movq %r15, %r11
- CFI_REGISTER rip, r11
- RESTORE_REST
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip, 0
- ret
+ DEFAULT_FRAME 1 8 /* offset 8: return address */
+ RESTORE_TOP_OF_STACK %r11, 8
+ movq_cfi_restore R15+8, r15
+ movq_cfi_restore R14+8, r14
+ movq_cfi_restore R13+8, r13
+ movq_cfi_restore R12+8, r12
+ movq_cfi_restore RBP+8, rbp
+ movq_cfi_restore RBX+8, rbx
+ ret $REST_SKIP /* pop extended registers */
CFI_ENDPROC
END(ptregscall_common)
-
+
ENTRY(stub_execve)
CFI_STARTPROC
popq %r11
@@ -588,11 +751,11 @@ ENTRY(stub_execve)
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
-
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
- */
+ */
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
@@ -608,76 +771,76 @@ ENTRY(stub_rt_sigreturn)
END(stub_rt_sigreturn)
/*
- * initial frame state for interrupts and exceptions
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
*/
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
- .endm
+ .section .init.rodata,"a"
+ENTRY(interrupt)
+ .text
+ .p2align 5
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ENTRY(irq_entries_start)
+ INTR_FRAME
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ .balign 32
+ .rept 7
+ .if vector < NR_VECTORS
+ .if vector <> FIRST_EXTERNAL_VECTOR
+ CFI_ADJUST_CFA_OFFSET -8
+ .endif
+1: pushq $(~vector+0x80) /* Note: always in signed byte range */
+ CFI_ADJUST_CFA_OFFSET 8
+ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ jmp 2f
+ .endif
+ .previous
+ .quad 1b
+ .text
+vector=vector+1
+ .endif
+ .endr
+2: jmp common_interrupt
+.endr
+ CFI_ENDPROC
+END(irq_entries_start)
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame RIP
-/* initial frame state for exceptions with error code (and interrupts with
- vector already pushed) */
-#define XCPT_FRAME _frame ORIG_RAX
+.previous
+END(interrupt)
+.previous
-/*
+/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
- *
- * Entry runs with interrupts off.
- */
+ *
+ * Entry runs with interrupts off.
+ */
-/* 0(%rsp): interrupt number */
+/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- cld
- SAVE_ARGS
- leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
- pushq %rbp
- /*
- * Save rbp twice: One is for marking the stack frame, as usual, and the
- * other, to fill pt_regs properly. This is because bx comes right
- * before the last saved register in that structure, and not bp. If the
- * base pointer were in the place bx is today, this would not be needed.
- */
- movq %rbp, -8(%rsp)
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp, 0
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- testl $3,CS(%rdi)
- je 1f
- SWAPGS
- /* irqcount is used to check if a CPU is already on an interrupt
- stack or not. While this is essentially redundant with preempt_count
- it is a little cheaper to use a separate counter in the PDA
- (short of moving irq_enter into assembly, which would be too
- much work) */
-1: incl %gs:pda_irqcount
- cmoveq %gs:pda_irqstackptr,%rsp
- push %rbp # backlink for old unwinder
- /*
- * We entered an interrupt context - irqs are off:
- */
- TRACE_IRQS_OFF
+ subq $10*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 10*8
+ call save_args
+ PARTIAL_FRAME 0
call \func
.endm
-ENTRY(common_interrupt)
+ /*
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_interrupt.
+ */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
XCPT_FRAME
+ addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
- /* 0(%rsp): oldrsp-ARGOFFSET */
+ /* 0(%rsp): old_rsp-ARGOFFSET */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
@@ -685,12 +848,12 @@ exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
je retint_kernel
-
+
/* Interrupt came from user space */
/*
* Has a correct top of stack, but a partial stack frame
* %rcx: thread info. Interrupts off.
- */
+ */
retint_with_reschedule:
movl $_TIF_WORK_MASK,%edi
retint_check:
@@ -763,20 +926,20 @@ retint_careful:
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
- popq %rdi
+ popq %rdi
CFI_ADJUST_CFA_OFFSET -8
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check
-
+
retint_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- movq $-1,ORIG_RAX(%rsp)
+ movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
@@ -798,324 +961,213 @@ ENTRY(retint_kernel)
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
-#endif
+#endif
CFI_ENDPROC
END(common_interrupt)
-
+
/*
* APIC interrupts.
- */
- .macro apicinterrupt num,func
+ */
+.macro apicinterrupt num sym do_sym
+ENTRY(\sym)
INTR_FRAME
pushq $~(\num)
CFI_ADJUST_CFA_OFFSET 8
- interrupt \func
+ interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
- .endm
-
-ENTRY(thermal_interrupt)
- apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
-END(thermal_interrupt)
-
-ENTRY(threshold_interrupt)
- apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
-END(threshold_interrupt)
-
-#ifdef CONFIG_SMP
-ENTRY(reschedule_interrupt)
- apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
-END(reschedule_interrupt)
-
- .macro INVALIDATE_ENTRY num
-ENTRY(invalidate_interrupt\num)
- apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
-END(invalidate_interrupt\num)
- .endm
+END(\sym)
+.endm
- INVALIDATE_ENTRY 0
- INVALIDATE_ENTRY 1
- INVALIDATE_ENTRY 2
- INVALIDATE_ENTRY 3
- INVALIDATE_ENTRY 4
- INVALIDATE_ENTRY 5
- INVALIDATE_ENTRY 6
- INVALIDATE_ENTRY 7
-
-ENTRY(call_function_interrupt)
- apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
-END(call_function_interrupt)
-ENTRY(call_function_single_interrupt)
- apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
-END(call_function_single_interrupt)
-ENTRY(irq_move_cleanup_interrupt)
- apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
-END(irq_move_cleanup_interrupt)
+#ifdef CONFIG_SMP
+apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
#endif
-ENTRY(apic_timer_interrupt)
- apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
-END(apic_timer_interrupt)
+#ifdef CONFIG_X86_UV
+apicinterrupt UV_BAU_MESSAGE \
+ uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
+apicinterrupt LOCAL_TIMER_VECTOR \
+ apic_timer_interrupt smp_apic_timer_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
+ invalidate_interrupt0 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
+ invalidate_interrupt1 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
+ invalidate_interrupt2 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
+ invalidate_interrupt3 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
+ invalidate_interrupt4 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
+ invalidate_interrupt5 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
+ invalidate_interrupt6 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
+ invalidate_interrupt7 smp_invalidate_interrupt
+#endif
-ENTRY(uv_bau_message_intr1)
- apicinterrupt 220,uv_bau_message_interrupt
-END(uv_bau_message_intr1)
+apicinterrupt THRESHOLD_APIC_VECTOR \
+ threshold_interrupt mce_threshold_interrupt
+apicinterrupt THERMAL_APIC_VECTOR \
+ thermal_interrupt smp_thermal_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+ call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR \
+ call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR \
+ reschedule_interrupt smp_reschedule_interrupt
+#endif
-ENTRY(error_interrupt)
- apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
-END(error_interrupt)
+apicinterrupt ERROR_APIC_VECTOR \
+ error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR \
+ spurious_interrupt smp_spurious_interrupt
-ENTRY(spurious_interrupt)
- apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
-END(spurious_interrupt)
-
/*
* Exception entry points.
- */
- .macro zeroentry sym
+ */
+.macro zeroentry sym do_sym
+ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0 /* push error code/oldrax */
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rax /* push real oldrax to the rdi slot */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call error_entry
+ DEFAULT_FRAME 0
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
- .macro errorentry sym
- XCPT_FRAME
+.macro paranoidzeroentry sym do_sym
+ENTRY(\sym)
+ INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq %rax
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ subq $15*8, %rsp
+ call save_paranoid
+ TRACE_IRQS_OFF
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \do_sym
+ jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
- /* error code is on the stack already */
- /* handle NMI like exceptions that can happen everywhere */
- .macro paranoidentry sym, ist=0, irqtrace=1
- SAVE_ALL
- cld
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f
- SWAPGS
- xorl %ebx,%ebx
-1:
- .if \ist
- movq %gs:pda_data_offset, %rbp
- .endif
- .if \irqtrace
- TRACE_IRQS_OFF
- .endif
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi
- movq $-1,ORIG_RAX(%rsp)
- .if \ist
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- call \sym
- .if \ist
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- DISABLE_INTERRUPTS(CLBR_NONE)
- .if \irqtrace
+.macro paranoidzeroentry_ist sym do_sym ist
+ENTRY(\sym)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ CFI_ADJUST_CFA_OFFSET 8
+ subq $15*8, %rsp
+ call save_paranoid
TRACE_IRQS_OFF
- .endif
- .endm
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ PER_CPU(init_tss, %rbp)
+ subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
+ call \do_sym
+ addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
+END(\sym)
+.endm
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- *
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
- */
- .macro paranoidexit trace=1
- /* ebx: no swapgs flag */
-paranoid_exit\trace:
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore\trace
- testl $3,CS(%rsp)
- jnz paranoid_userspace\trace
-paranoid_swapgs\trace:
- .if \trace
- TRACE_IRQS_IRETQ 0
- .endif
- SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
- RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace\trace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs\trace
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule\trace
- movl %ebx,%edx /* arg3: thread flags */
- .if \trace
- TRACE_IRQS_ON
- .endif
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- .if \trace
- TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
-paranoid_schedule\trace:
- .if \trace
- TRACE_IRQS_ON
- .endif
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- .if \trace
- TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
+.macro errorentry sym do_sym
+ENTRY(\sym)
+ XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call error_entry
+ DEFAULT_FRAME 0
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.
- */
-KPROBE_ENTRY(error_entry)
- _frame RDI
- CFI_REL_OFFSET rax,0
- /* rdi slot contains rax, oldrax contains error code */
- cld
- subq $14*8,%rsp
- CFI_ADJUST_CFA_OFFSET (14*8)
- movq %rsi,13*8(%rsp)
- CFI_REL_OFFSET rsi,RSI
- movq 14*8(%rsp),%rsi /* load rax from rdi slot */
- CFI_REGISTER rax,rsi
- movq %rdx,12*8(%rsp)
- CFI_REL_OFFSET rdx,RDX
- movq %rcx,11*8(%rsp)
- CFI_REL_OFFSET rcx,RCX
- movq %rsi,10*8(%rsp) /* store rax */
- CFI_REL_OFFSET rax,RAX
- movq %r8, 9*8(%rsp)
- CFI_REL_OFFSET r8,R8
- movq %r9, 8*8(%rsp)
- CFI_REL_OFFSET r9,R9
- movq %r10,7*8(%rsp)
- CFI_REL_OFFSET r10,R10
- movq %r11,6*8(%rsp)
- CFI_REL_OFFSET r11,R11
- movq %rbx,5*8(%rsp)
- CFI_REL_OFFSET rbx,RBX
- movq %rbp,4*8(%rsp)
- CFI_REL_OFFSET rbp,RBP
- movq %r12,3*8(%rsp)
- CFI_REL_OFFSET r12,R12
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13,R13
- movq %r14,1*8(%rsp)
- CFI_REL_OFFSET r14,R14
- movq %r15,(%rsp)
- CFI_REL_OFFSET r15,R15
- xorl %ebx,%ebx
- testl $3,CS(%rsp)
- je error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- movq %rdi,RDI(%rsp)
- CFI_REL_OFFSET rdi,RDI
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp)
- call *%rax
- /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
-error_exit:
- movl %ebx,%eax
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
+ /* error code is on the stack already */
+.macro paranoiderrorentry sym do_sym
+ENTRY(\sym)
+ XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jne retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \do_sym
+ jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
+END(\sym)
+.endm
-error_kernelspace:
- incl %ebx
- /* There are two places in the kernel that can potentially fault with
- usergs. Handle them here. The exception handlers after
- iret run with kernel gs again, so don't set the user space flag.
- B stepping K8s sometimes report an truncated RIP for IRET
- exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- cmpq $gs_change,RIP(%rsp)
- je error_swapgs
- jmp error_sti
-KPROBE_END(error_entry)
-
- /* Reload gs selector with exception handling */
- /* edi: new selector */
+zeroentry divide_error do_divide_error
+zeroentry overflow do_overflow
+zeroentry bounds do_bounds
+zeroentry invalid_op do_invalid_op
+zeroentry device_not_available do_device_not_available
+paranoiderrorentry double_fault do_double_fault
+zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+errorentry invalid_TSS do_invalid_TSS
+errorentry segment_not_present do_segment_not_present
+zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
+zeroentry coprocessor_error do_coprocessor_error
+errorentry alignment_check do_alignment_check
+zeroentry simd_coprocessor_error do_simd_coprocessor_error
+
+ /* Reload gs selector with exception handling */
+ /* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
pushf
CFI_ADJUST_CFA_OFFSET 8
- DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
- SWAPGS
-gs_change:
- movl %edi,%gs
+ DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
+ SWAPGS
+gs_change:
+ movl %edi,%gs
2: mfence /* workaround */
SWAPGS
- popf
+ popf
CFI_ADJUST_CFA_OFFSET -8
- ret
+ ret
CFI_ENDPROC
-ENDPROC(native_load_gs_index)
-
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
- .section .fixup,"ax"
+END(native_load_gs_index)
+
+ .section __ex_table,"a"
+ .align 8
+ .quad gs_change,bad_gs
+ .previous
+ .section .fixup,"ax"
/* running with kernelgs */
-bad_gs:
+bad_gs:
SWAPGS /* switch back to user gs */
xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
- .previous
-
+ movl %eax,%gs
+ jmp 2b
+ .previous
+
/*
* Create a kernel thread.
*
@@ -1138,7 +1190,7 @@ ENTRY(kernel_thread)
xorl %r8d,%r8d
xorl %r9d,%r9d
-
+
# clone now
call do_fork
movq %rax,RAX(%rsp)
@@ -1149,15 +1201,15 @@ ENTRY(kernel_thread)
* so internally to the x86_64 port you can rely on kernel_thread()
* not to reschedule the child before returning, this avoids the need
* of hacks for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
+ * [Hopefully no generic code relies on the reschedule -AK]
*/
RESTORE_ALL
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_thread)
-
-child_rip:
+END(kernel_thread)
+
+ENTRY(child_rip)
pushq $0 # fake return address
CFI_STARTPROC
/*
@@ -1170,8 +1222,9 @@ child_rip:
# exit
mov %eax, %edi
call do_exit
+ ud2 # padding for call trace
CFI_ENDPROC
-ENDPROC(child_rip)
+END(child_rip)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1191,10 +1244,10 @@ ENDPROC(child_rip)
ENTRY(kernel_execve)
CFI_STARTPROC
FAKE_STACK_FRAME $0
- SAVE_ALL
+ SAVE_ALL
movq %rsp,%rcx
call sys_execve
- movq %rax, RAX(%rsp)
+ movq %rax, RAX(%rsp)
RESTORE_REST
testq %rax,%rax
je int_ret_from_sys_call
@@ -1202,129 +1255,7 @@ ENTRY(kernel_execve)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_execve)
-
-KPROBE_ENTRY(page_fault)
- errorentry do_page_fault
-KPROBE_END(page_fault)
-
-ENTRY(coprocessor_error)
- zeroentry do_coprocessor_error
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- zeroentry do_simd_coprocessor_error
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- zeroentry do_device_not_available
-END(device_not_available)
-
- /* runs on exception stack */
-KPROBE_ENTRY(debug)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_debug, DEBUG_STACK
- paranoidexit
-KPROBE_END(debug)
-
- /* runs on exception stack */
-KPROBE_ENTRY(nmi)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_nmi, 0, 0
-#ifdef CONFIG_TRACE_IRQFLAGS
- paranoidexit 0
-#else
- jmp paranoid_exit1
- CFI_ENDPROC
-#endif
-KPROBE_END(nmi)
-
-KPROBE_ENTRY(int3)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_int3, DEBUG_STACK
- jmp paranoid_exit1
- CFI_ENDPROC
-KPROBE_END(int3)
-
-ENTRY(overflow)
- zeroentry do_overflow
-END(overflow)
-
-ENTRY(bounds)
- zeroentry do_bounds
-END(bounds)
-
-ENTRY(invalid_op)
- zeroentry do_invalid_op
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- zeroentry do_coprocessor_segment_overrun
-END(coprocessor_segment_overrun)
-
- /* runs on exception stack */
-ENTRY(double_fault)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_double_fault
- jmp paranoid_exit1
- CFI_ENDPROC
-END(double_fault)
-
-ENTRY(invalid_TSS)
- errorentry do_invalid_TSS
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- errorentry do_segment_not_present
-END(segment_not_present)
-
- /* runs on exception stack */
-ENTRY(stack_segment)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_stack_segment
- jmp paranoid_exit1
- CFI_ENDPROC
-END(stack_segment)
-
-KPROBE_ENTRY(general_protection)
- errorentry do_general_protection
-KPROBE_END(general_protection)
-
-ENTRY(alignment_check)
- errorentry do_alignment_check
-END(alignment_check)
-
-ENTRY(divide_error)
- zeroentry do_divide_error
-END(divide_error)
-
-ENTRY(spurious_interrupt_bug)
- zeroentry do_spurious_interrupt_bug
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_X86_MCE
- /* runs on exception stack */
-ENTRY(machine_check)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_machine_check
- jmp paranoid_exit1
- CFI_ENDPROC
-END(machine_check)
-#endif
+END(kernel_execve)
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
@@ -1334,81 +1265,77 @@ ENTRY(call_softirq)
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- incl %gs:pda_irqcount
- cmove %gs:pda_irqstackptr,%rsp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
-ENDPROC(call_softirq)
-
-KPROBE_ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
- sysret
- CFI_ENDPROC
-ENDPROC(ignore_sysret)
+END(call_softirq)
#ifdef CONFIG_XEN
-ENTRY(xen_hypervisor_callback)
- zeroentry xen_do_hypervisor_callback
-END(xen_hypervisor_callback)
+zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
-# A note on the "critical region" in our callback handler.
-# We want to avoid stacking callback handlers due to events occurring
-# during handling of the last event. To do this, we keep events disabled
-# until we've done all processing. HOWEVER, we must enable events before
-# popping the stack frame (can't be done atomically) and so it would still
-# be possible to get enough handler activations to overflow the stack.
-# Although unlikely, bugs of that kind are hard to track down, so we'd
-# like to avoid the possibility.
-# So, on entry to the handler we detect whether we interrupted an
-# existing activation in its critical region -- if so, we pop the current
-# activation and restart the handler using the previous one.
-*/
+ * A note on the "critical region" in our callback handler.
+ * We want to avoid stacking callback handlers due to events occurring
+ * during handling of the last event. To do this, we keep events disabled
+ * until we've done all processing. HOWEVER, we must enable events before
+ * popping the stack frame (can't be done atomically) and so it would still
+ * be possible to get enough handler activations to overflow the stack.
+ * Although unlikely, bugs of that kind are hard to track down, so we'd
+ * like to avoid the possibility.
+ * So, on entry to the handler we detect whether we interrupted an
+ * existing activation in its critical region -- if so, we pop the current
+ * activation and restart the handler using the previous one.
+ */
ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
CFI_STARTPROC
-/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
- see the correct pointer to the pt_regs */
+/*
+ * Since we don't modify %rdi, xen_evtchn_do_upcall(struct *pt_regs) will
+ * see the correct pointer to the pt_regs
+ */
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
- CFI_DEFAULT_STACK
-11: incl %gs:pda_irqcount
+ DEFAULT_FRAME
+11: incl PER_CPU_VAR(irq_count) # ZF is set when the count becomes zero
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- cmovzq %gs:pda_irqstackptr,%rsp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp # only then load irq_stack_ptr into %rsp
pushq %rbp # backlink for old unwinder
call xen_evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
jmp error_exit
CFI_ENDPROC
-END(do_hypervisor_callback)
+END(xen_do_hypervisor_callback)
/*
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we do not need to fix up as Xen has already reloaded all segment
-# registers that could be reloaded and zeroed the others.
-# Category 2 we fix up by killing the current process. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by comparing each saved segment register
-# with its current contents: any discrepancy means we in category 1.
-*/
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ * registers that could be reloaded and zeroed the others.
+ * Category 2 we fix up by killing the current process. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by comparing each saved segment register
+ * with its current contents: any discrepancy means we are in category 1.
+ */
ENTRY(xen_failsafe_callback)
- framesz = (RIP-0x30) /* workaround buggy gas */
- _frame framesz
- CFI_REL_OFFSET rcx, 0
- CFI_REL_OFFSET r11, 8
+ INTR_FRAME 1 (6*8)
+ /*CFI_REL_OFFSET gs,GS*/
+ /*CFI_REL_OFFSET fs,FS*/
+ /*CFI_REL_OFFSET es,ES*/
+ /*CFI_REL_OFFSET ds,DS*/
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
@@ -1429,12 +1356,9 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rcx
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $0 /* RIP */
+ pushq_cfi %r11
+ pushq_cfi %rcx
jmp general_protection
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1444,11 +1368,223 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $0
SAVE_ALL
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection
+errorentry page_fault do_page_fault
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check do_machine_check
+#endif
+
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * With CONFIG_TRACE_IRQFLAGS the NMI handler uses its own untraced copy
+ * of this path, because irq-tracing is fundamentally NMI-unsafe. (we
+ * cannot change the soft and hard flags at once, atomically)
+ */
+
+ /* ebx: no swapgs flag */
+ENTRY(paranoid_exit)
+ INTR_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore /* flag set: restore registers without swapgs */
+ testl $3,CS(%rsp) /* low two bits of the saved CS set? */
+ jnz paranoid_userspace /* yes: look at the thread flags first */
+paranoid_swapgs:
+ TRACE_IRQS_IRETQ 0
+ SWAPGS_UNSAFE_STACK
+paranoid_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx /* any _TIF_WORK_MASK bits set? */
+ jz paranoid_swapgs /* none: swapgs and return */
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace /* re-read TI_flags and check again */
+paranoid_schedule:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace /* re-read TI_flags and check again */
+ CFI_ENDPROC
+END(paranoid_exit)
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * Returns the "no swapgs flag" in %ebx.
+ */
+ENTRY(error_entry)
+ XCPT_FRAME
+ CFI_ADJUST_CFA_OFFSET 15*8
+ /* oldrax contains error code */
+ cld
+ movq_cfi rdi, RDI+8 /* offsets here are biased by 8, presumably for */
+ movq_cfi rsi, RSI+8 /* the return address pushed by the caller's */
+ movq_cfi rdx, RDX+8 /* "call error_entry" -- TODO confirm */
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ xorl %ebx,%ebx /* flag = 0: swapgs needed on exit */
+ testl $3,CS+8(%rsp) /* low two bits of the saved CS clear? */
+ je error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+ TRACE_IRQS_OFF
+ ret /* back to the entry stub, flag in %ebx */
+ CFI_ENDPROC
+
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report a truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
+ */
+error_kernelspace:
+ incl %ebx /* flag = 1: no swapgs on exit */
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp) /* saved RIP == irq_return? */
+ je error_swapgs
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP+8(%rsp) /* saved RIP == low 32 bits of irq_return? */
+ je error_swapgs
+ cmpq $gs_change,RIP+8(%rsp) /* saved RIP == gs_change? */
+ je error_swapgs
+ jmp error_sti
+END(error_entry)
+
+
+/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+ENTRY(error_exit)
+ DEFAULT_FRAME
+ movl %ebx,%eax /* keep the flag in %eax; RESTORE_REST presumably reloads %rbx */
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ testl %eax,%eax
+ jne retint_kernel /* flag set: take the kernel return path */
+ LOCKDEP_SYS_EXIT_IRQ
+ movl TI_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx /* any _TIF_WORK_MASK bits set? */
+ jnz retint_careful /* yes: handle the pending work first */
+ jmp retint_swapgs
+ CFI_ENDPROC
+END(error_exit)
+
+
+ /* runs on exception stack */
+ENTRY(nmi)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq_cfi $-1 /* presumably the ORIG_RAX slot, as in the other entry stubs */
+ subq $15*8, %rsp /* room for 15 quadwords */
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ movq $-1,%rsi /* arg2: -1 */
+ call do_nmi
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /* paranoidexit; without TRACE_IRQS_OFF */
+ /* ebx: no swapgs flag */
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore /* flag set: restore registers without swapgs */
+ testl $3,CS(%rsp) /* low two bits of the saved CS set? */
+ jnz nmi_userspace
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+nmi_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx /* any _TIF_WORK_MASK bits set? */
+ jz nmi_swapgs /* none: swapgs and return */
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz nmi_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ jmp nmi_userspace /* re-read TI_flags and check again */
+nmi_schedule:
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ jmp nmi_userspace /* re-read TI_flags and check again */
+ CFI_ENDPROC
+#else
+ jmp paranoid_exit /* shared exit path; presumably its TRACE_IRQS_* calls are no-ops here */
+ CFI_ENDPROC
+#endif
+END(nmi)
+
+ENTRY(ignore_sysret)
+ CFI_STARTPROC
+ mov $-ENOSYS,%eax /* load -ENOSYS into %eax ... */
+ sysret /* ... and return straight away via sysret */
+ CFI_ENDPROC
+END(ignore_sysret)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index f454c78fcef..55515d73d9c 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,10 @@
#include <asm/io.h>
#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/atomic.h>
#include <asm/apicdef.h>
-#include <mach_mpparse.h>
+#include <asm/genapic.h>
+#include <asm/setup.h>
/*
* ES7000 chipsets
@@ -161,6 +163,39 @@ es7000_rename_gsi(int ioapic, int gsi)
return gsi;
}
+/*
+ * Ask the platform to start a secondary CPU by writing a request word to
+ * the location psai points at.
+ *
+ * @cpu: value OR-ed into the low bits of the request word
+ * @eip: start address; its physical page number (__pa(eip) / 0x1000) is
+ *       placed at bit 16 and up of the request word
+ *
+ * Returns -1 if psai is NULL, 0 once the request word has been written.
+ */
+static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+	unsigned long page_nr;
+	unsigned long request;
+
+	if (!psai)
+		return -1;
+
+	page_nr = (unsigned long)__pa(eip) / 0x1000;
+	request = 0x1000000 | (page_nr << 16) | cpu;
+
+	/*
+	 * Spin until bit 0x1000000 of *psai is clear -- presumably the
+	 * "request pending" flag of an earlier write. There is no timeout.
+	 */
+	while (*psai & 0x1000000)
+		;
+
+	*psai = request;
+
+	return 0;
+}
+
+/*
+ * Install the ES7000 CPU wake-up method in the active APIC driver and,
+ * on family 6 models 7 through 11, additionally call
+ * es7000_update_genapic_to_cluster() and clear wait_for_init_deassert.
+ *
+ * Always returns 0.
+ */
+static int __init es7000_update_genapic(void)
+{
+	apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+
+	/*
+	 * MPENTIUMIII
+	 *
+	 * The model range check must use &&: with || the test
+	 * (model >= 7 || model <= 11) is true for every model number.
+	 */
+	if (boot_cpu_data.x86 == 6 &&
+	    boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11) {
+		es7000_update_genapic_to_cluster();
+		apic->wait_for_init_deassert = NULL;
+		/* set again in case the cluster update changed it (not visible here) */
+		apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+	}
+
+	return 0;
+}
+
void __init
setup_unisys(void)
{
@@ -176,6 +211,8 @@ setup_unisys(void)
else
es7000_plat = ES7000_CLASSIC;
ioapic_renumber_irq = es7000_rename_gsi;
+
+ x86_quirks->update_genapic = es7000_update_genapic;
}
/*
@@ -324,40 +361,449 @@ es7000_mip_write(struct mip_reg *mip_reg)
return status;
}
-int
-es7000_start_cpu(int cpu, unsigned long eip)
+void __init es7000_enable_apic_mode(void)
{
- unsigned long vect = 0, psaival = 0;
+ struct mip_reg es7000_mip_reg;
+ int mip_status;
- if (psai == NULL)
- return -1;
+ if (!es7000_plat)
+ return;
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
+ printk("ES7000: Enabling APIC mode.\n");
+ memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
+ es7000_mip_reg.off_0 = MIP_SW_APIC;
+ es7000_mip_reg.off_38 = MIP_VALID;
- while (*psai & 0x1000000)
- ;
+ while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) {
+ printk("es7000_enable_apic_mode: command failed, status = %x\n",
+ mip_status);
+ }
+}
+
+/*
+ * APIC driver for the Unisys ES7000 chipset.
+ */
+#define APIC_DEFINITION 1
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
+#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio)
+#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+
+extern void es7000_enable_apic_mode(void);
+extern int apic_version [MAX_APICS];
+extern u8 cpu_2_logical_apicid[];
+extern unsigned int boot_cpu_physical_apicid;
+
+extern int parse_unisys_oem (char *oemptr);
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
+extern void setup_unisys(void);
+
+#define apicid_cluster(apicid) (apicid & 0xF0)
+#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
+
+/*
+ * Report the vector allocation domain for @cpu through @retmask.  The
+ * answer does not depend on @cpu: word 0 of the mask is APIC_ALL_CPUS.
+ */
+static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+	/*
+	 * Careful. Some cpus do not strictly honor the set of cpus
+	 * specified in the interrupt destination when using lowest
+	 * priority interrupt delivery mode.
+	 *
+	 * In particular there was a hyperthreading cpu observed to
+	 * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+	 */
+	cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+
+	*retmask = domain;
+}
- *psai = psaival;
+/*
+ * Busy-wait (with cpu_relax()) until *@deassert becomes non-zero.
+ * Compiles to a no-op when CONFIG_ES7000_CLUSTERED_APIC is defined.
+ */
+static void es7000_wait_for_init_deassert(atomic_t *deassert)
+{
+#ifndef CONFIG_ES7000_CLUSTERED_APIC
+	for (;;) {
+		if (atomic_read(deassert))
+			break;
+		cpu_relax();
+	}
+#endif
+}
+
+/* Return bits 24-31 of @x, i.e. the field selected by 0xFF << 24. */
+static unsigned int es7000_get_apic_id(unsigned long x)
+{
+	unsigned int id = (x >> 24) & 0xFF;
+
+	return id;
+}
+
+#ifdef CONFIG_ACPI
+static int es7000_check_dsdt(void)
+{
+ struct acpi_table_header header;
+
+ if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
+ !strncmp(header.oem_id, "UNISYS", 6))
+ return 1;
return 0;
+}
+#endif
+static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(mask, vector);
}
-void __init
-es7000_sw_apic(void)
-{
- if (es7000_plat) {
- int mip_status;
- struct mip_reg es7000_mip_reg;
-
- printk("ES7000: Enabling APIC mode.\n");
- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
- es7000_mip_reg.off_0 = MIP_SW_APIC;
- es7000_mip_reg.off_38 = (MIP_VALID);
- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
- printk("es7000_sw_apic: command failed, status = %x\n",
- mip_status);
- return;
+/*
+ * Send @vector over cpu_online_mask using the physical-mode
+ * "allbutself" helper (presumably skipping the calling CPU, going by
+ * the helper's name).
+ */
+static void es7000_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+/* Send @vector to every CPU in cpu_online_mask via es7000_send_IPI_mask(). */
+static void es7000_send_IPI_all(int vector)
+{
+ es7000_send_IPI_mask(cpu_online_mask, vector);
+}
+
+/* genapic .apic_id_registered hook: unconditionally reports 1. */
+static int es7000_apic_id_registered(void)
+{
+ return 1;
+}
+
+/*
+ * .target_cpus hook installed by es7000_update_genapic_to_cluster():
+ * returns a pointer to CPU_MASK_ALL.
+ */
+static const cpumask_t *target_cpus_cluster(void)
+{
+ return &CPU_MASK_ALL;
+}
+
+/* Default .target_cpus hook: the mask holding only the calling CPU. */
+static const cpumask_t *es7000_target_cpus(void)
+{
+	int this_cpu = smp_processor_id();
+
+	return &cpumask_of_cpu(this_cpu);
+}
+
+/* .check_apicid_used hook: ignores both arguments and always returns 0. */
+static unsigned long
+es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+/* Return whether @bit is set in phys_cpu_present_map. */
+static unsigned long es7000_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+/*
+ * Build the value written to APIC_LDR for @cpu: the per-cpu
+ * x86_bios_cpu_apicid (via xapic_phys_to_log_apicid()) run through
+ * SET_APIC_LOGICAL_ID().
+ */
+static unsigned long calculate_ldr(int cpu)
+{
+	/* widen to unsigned long before the macro operates on it */
+	unsigned long logical_id = xapic_phys_to_log_apicid(cpu);
+
+	return SET_APIC_LOGICAL_ID(logical_id);
+}
+
+/*
+ * Set up the logical destination ID (cluster variant).
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void es7000_init_apic_ldr_cluster(void)
+{
+	int this_cpu = smp_processor_id();
+
+	/* DFR first, then the LDR computed for this CPU */
+	apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
+	apic_write(APIC_LDR, calculate_ldr(this_cpu));
+}
+
+/*
+ * Default .init_apic_ldr hook: write APIC_DFR_VALUE to the DFR, then
+ * the LDR value calculate_ldr() yields for the calling CPU.
+ */
+static void es7000_init_apic_ldr(void)
+{
+	int this_cpu = smp_processor_id();
+
+	apic_write(APIC_DFR, APIC_DFR_VALUE);
+	apic_write(APIC_LDR, calculate_ldr(this_cpu));
+}
+
+/*
+ * Log the APIC mode in use.  The mode name is chosen from the
+ * apic_version[] entry of the calling CPU's BIOS APIC id (0x14 selects
+ * "Physical Cluster"); the I/O APIC count and the first word of the
+ * target-cpus mask are printed alongside it.
+ */
+static void es7000_setup_apic_routing(void)
+{
+	int bios_apicid = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
+	const char *mode;
+
+	if (apic_version[bios_apicid] == 0x14)
+		mode = "Physical Cluster";
+	else
+		mode = "Logical Cluster";
+
+	printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+	       mode, nr_ioapics, cpus_addr(*es7000_target_cpus())[0]);
+}
+
+/* .apicid_to_node hook: every logical APIC id maps to node 0. */
+static int es7000_apicid_to_node(int logical_apicid)
+{
+ return 0;
+}
+
+
+/*
+ * Map MP-table cpu number @mps_cpu to an APIC id: cpu 0 yields
+ * boot_cpu_physical_apicid, any other number below nr_cpu_ids yields
+ * that cpu's x86_bios_cpu_apicid, and anything else BAD_APICID.
+ */
+static int es7000_cpu_present_to_apicid(int mps_cpu)
+{
+	if (mps_cpu == 0)
+		return boot_cpu_physical_apicid;
+
+	if (mps_cpu >= nr_cpu_ids)
+		return BAD_APICID;
+
+	return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+}
+
+/*
+ * Hand out physid masks in call order: the first call returns the mask
+ * for id 0, the next for id 1, and so on.  @phys_apicid is not used.
+ */
+static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
+{
+	static int next_id;
+	physid_mask_t mask = physid_mask_of_physid(next_id);
+
+	next_id++;
+
+	return mask;
+}
+
+/*
+ * Map a cpu number to its logical APIC id.  On SMP this is a lookup in
+ * cpu_2_logical_apicid[], with BAD_APICID for cpu >= nr_cpu_ids; on UP
+ * it is logical_smp_processor_id().
+ */
+static int es7000_cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+	if (cpu < nr_cpu_ids)
+		return (int)cpu_2_logical_apicid[cpu];
+
+	return BAD_APICID;
+#else
+	return logical_smp_processor_id();
+#endif
+}
+
+/*
+ * .ioapic_phys_id_map hook: ignores @phys_map and returns
+ * physids_promote(0xff).
+ */
+static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xff);
+}
+
+/*
+ * .check_phys_apicid_present hook: always answers 1.  As a side effect
+ * it refreshes boot_cpu_physical_apicid from read_apic_id();
+ * @cpu_physical_apicid itself is not examined.
+ */
+static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
+{
+	boot_cpu_physical_apicid = read_apic_id();
+
+	return 1;
+}
+
+/*
+ * Cluster-mode translation of @cpumask into a logical APIC id.
+ *
+ * Returns 0xFF when the mask has nr_cpu_ids bits set, or when its cpus
+ * do not all share one APIC cluster (high nibble of the logical id);
+ * otherwise the logical id of the last cpu found in the mask.
+ */
+static unsigned int
+es7000_cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
+{
+	int total = cpumask_weight(cpumask);
+	int seen;
+	int apicid;
+	int cpu;
+
+	/* Return id to all */
+	if (total == nr_cpu_ids)
+		return 0xFF;
+
+	/*
+	 * The cpus in the mask must all be on the same apic cluster. If
+	 * they are not, return the default value of target_cpus():
+	 */
+	cpu = cpumask_first(cpumask);
+	apicid = es7000_cpu_to_logical_apicid(cpu);
+
+	for (seen = 0; seen < total; cpu++) {
+		int new_apicid;
+
+		if (!cpumask_test_cpu(cpu, cpumask))
+			continue;
+
+		new_apicid = es7000_cpu_to_logical_apicid(cpu);
+		if (apicid_cluster(apicid) != apicid_cluster(new_apicid)) {
+			printk ("%s: Not a valid mask!\n", __func__);
+
+			return 0xFF;
+		}
+
+		apicid = new_apicid;
+		seen++;
+	}
+
+	return apicid;
+}
+
+static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ int cpus_found = 0;
+ int num_bits_set;
+ int apicid;
+ int cpu;
+
+ num_bits_set = cpus_weight(*cpumask);
+ /* Return id to all */
+ if (num_bits_set == nr_cpu_ids)
+ return es7000_cpu_to_logical_apicid(0);
+ /*
+ * The cpus in the mask must all be on the apic cluster. If are not
+ * on the same apicid cluster return default value of target_cpus():
+ */
+ cpu = first_cpu(*cpumask);
+ apicid = es7000_cpu_to_logical_apicid(cpu);
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, *cpumask)) {
+ int new_apicid = es7000_cpu_to_logical_apicid(cpu);
+
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)) {
+ printk ("%s: Not a valid mask!\n", __func__);
+
+ return es7000_cpu_to_logical_apicid(0);
+ }
+ apicid = new_apicid;
+ cpus_found++;
+ }
+ cpu++;
}
+ return apicid;
}
+
+/*
+ * Translate (@inmask & @andmask & cpu_online_mask) into an APIC id via
+ * es7000_cpu_mask_to_apicid().  If the temporary cpumask cannot be
+ * allocated, the logical APIC id of cpu 0 is returned instead.
+ */
+static unsigned int
+es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+			      const struct cpumask *andmask)
+{
+	int apicid = es7000_cpu_to_logical_apicid(0);
+	cpumask_var_t scratch;
+
+	if (alloc_cpumask_var(&scratch, GFP_ATOMIC)) {
+		cpumask_and(scratch, inmask, andmask);
+		cpumask_and(scratch, scratch, cpu_online_mask);
+		apicid = es7000_cpu_mask_to_apicid(scratch);
+
+		free_cpumask_var(scratch);
+	}
+
+	return apicid;
+}
+
+/* Return @cpuid_apic shifted right by @index_msb bits. */
+static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+/*
+ * Switch the active genapic ('apic') to the cluster variants: target
+ * cpus, irq delivery mode, irq destination mode, LDR initialisation
+ * and cpumask-to-apicid translation.  Other hooks are left untouched.
+ */
+void __init es7000_update_genapic_to_cluster(void)
+{
+ apic->target_cpus = target_cpus_cluster;
+ apic->irq_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
+ apic->irq_dest_mode = INT_DEST_MODE_CLUSTER;
+
+ apic->init_apic_ldr = es7000_init_apic_ldr_cluster;
+
+ apic->cpu_mask_to_apicid = es7000_cpu_mask_to_apicid_cluster;
+}
+
+/* .probe hook: always returns 0; detection happens in the OEM-check hooks. */
+static int probe_es7000(void)
+{
+ /* probed later in mptable/ACPI hooks */
+ return 0;
+}
+
+/*
+ * MP-table OEM hook.  When the table carries an OEM pointer and @oem
+ * starts with "UNISYS", hand the OEM table to parse_unisys_oem() and
+ * return its result; otherwise return 0.  @productid is not used.
+ */
+static __init int
+es7000_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+	struct mpc_oemtable *oem_table;
+
+	if (!mpc->oemptr)
+		return 0;
+
+	if (strncmp(oem, "UNISYS", 6))
+		return 0;
+
+	oem_table = (struct mpc_oemtable *)mpc->oemptr;
+
+	return parse_unisys_oem((char *)oem_table);
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * Hook from generic ACPI tables.c.
+ *
+ * Returns 0 when no Unisys OEM table is found.  When one is found, the
+ * result is parse_unisys_oem()'s if the DSDT check succeeded;
+ * otherwise setup_unisys() is run and 1 is returned.  The OEM table is
+ * unmapped again before returning.  The arguments are not used.
+ */
+static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	unsigned long oem_addr = 0;
+	int dsdt_is_unisys;
+	int ret;
+
+	/* check dsdt at first to avoid clear fix_map for oem_addr */
+	dsdt_is_unisys = es7000_check_dsdt();
+
+	if (find_unisys_acpi_oem_table(&oem_addr))
+		return 0;
+
+	if (dsdt_is_unisys) {
+		ret = parse_unisys_oem((char *)oem_addr);
+	} else {
+		setup_unisys();
+		ret = 1;
+	}
+
+	/* we need to unmap it */
+	unmap_unisys_acpi_oem_table(oem_addr);
+
+	return ret;
+}
+#else
+/* Without ACPI there is nothing to check: always 0. */
+static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return 0;
+}
+#endif
+
+
+/*
+ * genapic descriptor for the Unisys ES7000.  These are the defaults
+ * (fixed delivery, irq_dest_mode 0); es7000_update_genapic() and
+ * es7000_update_genapic_to_cluster() overwrite some of the hooks
+ * through the 'apic' pointer (e.g. .wakeup_cpu, which is NULL here).
+ */
+struct genapic apic_es7000 = {
+
+ .name = "es7000",
+ .probe = probe_es7000,
+ .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
+ .apic_id_registered = es7000_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPUs: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = es7000_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = es7000_check_apicid_used,
+ .check_apicid_present = es7000_check_apicid_present,
+
+ .vector_allocation_domain = es7000_vector_allocation_domain,
+ .init_apic_ldr = es7000_init_apic_ldr,
+
+ .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
+ .setup_apic_routing = es7000_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = es7000_apicid_to_node,
+ .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
+ .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = es7000_check_phys_apicid_present,
+ .enable_apic_mode = es7000_enable_apic_mode,
+ .phys_pkg_id = es7000_phys_pkg_id,
+ .mps_oem_check = es7000_mps_oem_check,
+
+ .get_apic_id = es7000_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = es7000_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = es7000_send_IPI_allbutself,
+ .send_IPI_all = es7000_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+
+ .trampoline_phys_low = 0x467,
+ .trampoline_phys_high = 0x469,
+
+ .wait_for_init_deassert = es7000_wait_for_init_deassert,
+
+ /* Nothing to do for most platforms, since cleared by the INIT cycle: */
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+};
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9b..231bdd3c5b1 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
+#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
#include <asm/ftrace.h>
+#include <linux/ftrace.h>
#include <asm/nops.h>
+#include <asm/nmi.h>
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#ifdef CONFIG_DYNAMIC_FTRACE
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
} __attribute__((packed));
};
-
static int ftrace_calc_offset(long ip, long addr)
{
return (int)(addr - ip);
}
-unsigned char *ftrace_nop_replace(void)
-{
- return ftrace_nop;
-}
-
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
-int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUS from executing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ * and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status; /* holds return value of text write */
+static int mod_code_write; /* set when NMI should do the write */
+static void *mod_code_ip; /* holds the IP to write to */
+static void *mod_code_newcode; /* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+/*
+ * Format the NMI statistics as "<nmi_wait_count> <nmi_update_count>"
+ * into @buf (bounded by @size) and return snprintf()'s result.
+ */
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+	return snprintf(buf, size, "%u %u",
+			nmi_wait_count, atomic_read(&nmi_update_count));
+}
+
+/*
+ * Write MCOUNT_INSN_SIZE bytes from mod_code_newcode to mod_code_ip and
+ * record probe_kernel_write()'s result in mod_code_status.  Called both
+ * from do_ftrace_mod_code() and from ftrace_nmi_enter().
+ */
+static void ftrace_mod_code(void)
+{
+ /*
+ * Yes, more than one CPU process can be writing to mod_code_status.
+ * (and the code itself)
+ * But if one were to fail, then they all should, and if one were
+ * to succeed, then they all should.
+ */
+ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+ MCOUNT_INSN_SIZE);
+}
+
+/*
+ * NMI entry hook.  Marks this CPU as being inside an NMI and, if a code
+ * modification is in flight (mod_code_write set), performs the same
+ * write itself and counts it in nmi_update_count.
+ */
+void ftrace_nmi_enter(void)
+{
+ atomic_inc(&in_nmi);
+ /* Must have in_nmi seen before reading write flag */
+ smp_mb();
+ if (mod_code_write) {
+ ftrace_mod_code();
+ atomic_inc(&nmi_update_count);
+ }
+}
+
+/*
+ * NMI exit hook: drop the in_nmi count taken by ftrace_nmi_enter().
+ * NOTE(review): only a write barrier precedes the decrement - confirm
+ * that is sufficient for the wait_for_nmi() side.
+ */
+void ftrace_nmi_exit(void)
+{
+ /* Finish all executions before clearing in_nmi */
+ smp_wmb();
+ atomic_dec(&in_nmi);
+}
+
+/*
+ * Spin until in_nmi reads zero.  nmi_wait_count is bumped once per
+ * call that actually had to wait.
+ */
+static void wait_for_nmi(void)
+{
+	if (!atomic_read(&in_nmi))
+		return;
+
+	do {
+		cpu_relax();
+	} while (atomic_read(&in_nmi));
+
+	nmi_wait_count++;
+}
+
+/*
+ * Write @new_code at @ip while coordinating with NMI handlers: publish
+ * the target and the new bytes, raise mod_code_write, wait out running
+ * NMIs, do the write, lower the flag and wait once more.  Returns
+ * mod_code_status, i.e. the result of the text write.
+ */
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+ mod_code_ip = (void *)ip;
+ mod_code_newcode = new_code;
+
+ /* The buffers need to be visible before we let NMIs write them */
+ smp_wmb();
+
+ mod_code_write = 1;
+
+ /* Make sure write bit is visible before we wait on NMIs */
+ smp_mb();
+
+ wait_for_nmi();
+
+ /* Make sure all running NMIs have finished before we write the code */
+ smp_mb();
+
+ ftrace_mod_code();
+
+ /* Make sure the write happens before clearing the bit */
+ smp_wmb();
+
+ mod_code_write = 0;
+
+ /* make sure NMIs see the cleared bit */
+ smp_mb();
+
+ wait_for_nmi();
+
+ return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+/* Return the ftrace_nop[] buffer (MCOUNT_INSN_SIZE bytes). */
+static unsigned char *ftrace_nop_replace(void)
+{
+ return ftrace_nop;
+}
+
+static int
ftrace_modify_code(unsigned long ip, unsigned char *old_code,
unsigned char *new_code)
{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
return -EINVAL;
/* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+ if (do_ftrace_mod_code(ip, new_code))
return -EPERM;
sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
return 0;
}
+/*
+ * Replace the call to @addr at rec->ip with the nop sequence.  Returns
+ * whatever ftrace_modify_code() returns.  @mod is not used here.
+ */
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned char *call_insn = ftrace_call_replace(ip, addr);
+	unsigned char *nop_insn = ftrace_nop_replace();
+
+	return ftrace_modify_code(ip, call_insn, nop_insn);
+}
+
+/*
+ * Replace the nop sequence at rec->ip with a call to @addr.  Returns
+ * whatever ftrace_modify_code() returns.
+ */
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned char *nop_insn = ftrace_nop_replace();
+	unsigned char *call_insn = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(ip, nop_insn, call_insn);
+}
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,219 @@ int __init ftrace_dyn_arch_init(void *data)
return 0;
}
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+/*
+ * Retarget the jump instruction at @ip from @old_offset to @new_offset.
+ *
+ * Returns -EFAULT if the instruction cannot be read, -EINVAL if it is
+ * not opcode 0xe9 carrying @old_offset, -EPERM if the write fails and
+ * 0 on success.
+ */
+static int ftrace_mod_jmp(unsigned long ip,
+			  int old_offset, int new_offset)
+{
+	unsigned char code[MCOUNT_INSN_SIZE];
+	int *offset = (int *)&code[1];
+
+	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* only patch what we expect to find there */
+	if (code[0] != 0xe9)
+		return -EINVAL;
+	if (*offset != old_offset)
+		return -EINVAL;
+
+	*offset = new_offset;
+
+	return do_ftrace_mod_code(ip, code) ? -EPERM : 0;
+}
+
+/*
+ * Point the jump at ftrace_graph_call to ftrace_graph_caller instead
+ * of ftrace_stub.  Offsets are relative to the end of the instruction.
+ */
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	unsigned long next_insn = ip + MCOUNT_INSN_SIZE;
+	int old_offset = (unsigned long)(&ftrace_stub) - next_insn;
+	int new_offset = (unsigned long)(&ftrace_graph_caller) - next_insn;
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+/*
+ * Point the jump at ftrace_graph_call back to ftrace_stub instead of
+ * ftrace_graph_caller.  Offsets are relative to the end of the
+ * instruction.
+ */
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	unsigned long next_insn = ip + MCOUNT_INSN_SIZE;
+	int old_offset = (unsigned long)(&ftrace_graph_caller) - next_insn;
+	int new_offset = (unsigned long)(&ftrace_stub) - next_insn;
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+/* NMI entry hook (no dynamic ftrace): just bump the in_nmi count. */
+void ftrace_nmi_enter(void)
+{
+ atomic_inc(&in_nmi);
+}
+
+/* NMI exit hook (no dynamic ftrace): drop the in_nmi count again. */
+void ftrace_nmi_exit(void)
+{
+ atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Push a return address onto the current task's return trace stack.
+ *
+ * @ret: original return address, @time: call timestamp, @func: traced
+ * function.  On success the new stack index is stored in *@depth and 0
+ * is returned.  Returns -EBUSY when the task has no ret_stack or the
+ * stack is full (the latter also bumps trace_overrun).
+ */
+static int push_return_trace(unsigned long ret, unsigned long long time,
+ unsigned long func, int *depth)
+{
+ int index;
+
+ if (!current->ret_stack)
+ return -EBUSY;
+
+ /* The return trace stack is full */
+ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ atomic_inc(&current->trace_overrun);
+ return -EBUSY;
+ }
+
+ /* claim the slot first; barrier() keeps the fills after the increment */
+ index = ++current->curr_ret_stack;
+ barrier();
+ current->ret_stack[index].ret = ret;
+ current->ret_stack[index].func = func;
+ current->ret_stack[index].calltime = time;
+ *depth = index;
+
+ return 0;
+}
+
+/*
+ * Pop the top entry of the current task's return trace stack.
+ *
+ * Fills @trace (func, calltime, overrun, depth) and stores the saved
+ * return address in *@ret.  If the stack index is negative, tracing is
+ * stopped, a warning is raised and *@ret is pointed at panic().
+ */
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+ int index;
+
+ index = current->curr_ret_stack;
+
+ if (unlikely(index < 0)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic, otherwise we have no where to go */
+ *ret = (unsigned long)panic;
+ return;
+ }
+
+ *ret = current->ret_stack[index].ret;
+ trace->func = current->ret_stack[index].func;
+ trace->calltime = current->ret_stack[index].calltime;
+ trace->overrun = atomic_read(&current->trace_overrun);
+ trace->depth = index;
+ /* read the entry out before releasing the slot */
+ barrier();
+ current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ *
+ * Pops the saved entry, stamps the return time and reports it through
+ * ftrace_graph_return().  If the saved return address is zero, tracing
+ * is stopped and panic() is returned in its place.
+ *
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+ struct ftrace_graph_ret trace;
+ unsigned long ret;
+
+ pop_return_trace(&trace, &ret);
+ trace.rettime = cpu_clock(raw_smp_processor_id());
+ ftrace_graph_return(&trace);
+
+ if (unlikely(!ret)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ ret = (unsigned long)panic;
+ }
+
+ return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ *
+ * @parent:    location holding the caller's return address; it is
+ *             overwritten with the address of return_to_handler.
+ * @self_addr: address of the function being traced.
+ *
+ * Does nothing while in NMI context or while graph tracing is paused
+ * for the current task.  Whenever the hook cannot be kept (bad old
+ * address, full stack, or the entry callback declines), *@parent is
+ * restored to its original value.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+	unsigned long old;
+	unsigned long long calltime;
+	int faulted;
+	struct ftrace_graph_ent trace;
+	unsigned long return_hooker = (unsigned long)
+				&return_to_handler;
+
+	/* Nmi's are currently unsupported */
+	if (unlikely(atomic_read(&in_nmi)))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	/*
+	 * Protect against fault, even if it shouldn't
+	 * happen. This tool is too much intrusive to
+	 * ignore such a protection.
+	 *
+	 * [old] is written by the first instruction while [parent] and
+	 * [return_hooker] are still needed by the second one, so it has
+	 * to be an earlyclobber output ("=&r"); otherwise the compiler is
+	 * free to give it the same register as one of the inputs.
+	 */
+	asm volatile(
+		"1: " _ASM_MOV " (%[parent]), %[old]\n"
+		"2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
+		" movl $0, %[faulted]\n"
+		"3:\n"
+
+		".section .fixup, \"ax\"\n"
+		"4: movl $1, %[faulted]\n"
+		" jmp 3b\n"
+		".previous\n"
+
+		_ASM_EXTABLE(1b, 4b)
+		_ASM_EXTABLE(2b, 4b)
+
+		: [old] "=&r" (old), [faulted] "=r" (faulted)
+		: [parent] "r" (parent), [return_hooker] "r" (return_hooker)
+		: "memory"
+	);
+
+	if (unlikely(faulted)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		return;
+	}
+
+	/* refuse to hook a return address that is not kernel text */
+	if (unlikely(!__kernel_text_address(old))) {
+		ftrace_graph_stop();
+		*parent = old;
+		WARN_ON(1);
+		return;
+	}
+
+	calltime = cpu_clock(raw_smp_processor_id());
+
+	if (push_return_trace(old, calltime,
+				self_addr, &trace.depth) == -EBUSY) {
+		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e9..820dea5d0eb 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
extern struct genapic apic_flat;
extern struct genapic apic_physflat;
@@ -28,10 +29,12 @@ extern struct genapic apic_x2xpic_uv_x;
extern struct genapic apic_x2apic_phys;
extern struct genapic apic_x2apic_cluster;
-struct genapic __read_mostly *genapic = &apic_flat;
+struct genapic __read_mostly *apic = &apic_flat;
static struct genapic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_UV
&apic_x2apic_uv_x,
+#endif
&apic_x2apic_phys,
&apic_x2apic_cluster,
&apic_physflat,
@@ -41,36 +44,39 @@ static struct genapic *apic_probe[] __initdata = {
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
-void __init setup_apic_routing(void)
+void __init default_setup_apic_routing(void)
{
- if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
+ if (apic == &apic_x2apic_phys || apic == &apic_x2apic_cluster) {
if (!intr_remapping_enabled)
- genapic = &apic_flat;
+ apic = &apic_flat;
}
- if (genapic == &apic_flat) {
+ if (apic == &apic_flat) {
if (max_physical_apicid >= 8)
- genapic = &apic_physflat;
- printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+ apic = &apic_physflat;
+ printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
}
+
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
}
/* Same for both flat and physical. */
void apic_send_IPI_self(int vector)
{
- __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
int i;
for (i = 0; apic_probe[i]; ++i) {
if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- genapic = apic_probe[i];
+ apic = apic_probe[i];
printk(KERN_INFO "Setting APIC routing to %s.\n",
- genapic->name);
+ apic->name);
return 1;
}
}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index c0262791bda..249d2d3c034 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -19,7 +19,6 @@
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
-#include <mach_apicdef.h>
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
@@ -30,12 +29,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 1;
}
-static cpumask_t flat_target_cpus(void)
+static const struct cpumask *flat_target_cpus(void)
{
- return cpu_online_map;
+ return cpu_online_mask;
}
-static cpumask_t flat_vector_allocation_domain(int cpu)
+static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
/* Careful. Some cpus do not strictly honor the set of cpus
* specified in the interrupt destination when using lowest
@@ -45,8 +44,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
* deliver interrupts to the wrong hyperthread when only one
* hyperthread was specified in the interrupt desitination.
*/
- cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
- return domain;
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
}
/*
@@ -69,48 +68,73 @@ static void flat_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
unsigned long flags;
local_irq_save(flags);
- __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
local_irq_restore(flags);
}
+/* Send @vector to the cpus named in the first word of @cpumask. */
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+	_flat_send_IPI_mask(cpumask_bits(cpumask)[0], vector);
+}
+
+/*
+ * Send @vector to the cpus named in the first word of @cpumask, with
+ * the calling cpu's own bit cleared.
+ */
+static void
+flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long mask = cpumask_bits(cpumask)[0];
+
+	/* 'mask' only has BITS_PER_LONG bits to clear from */
+	if (this_cpu < BITS_PER_LONG)
+		clear_bit(this_cpu, &mask);
+
+	_flat_send_IPI_mask(mask, vector);
+}
+
static void flat_send_IPI_allbutself(int vector)
{
+ int cpu = smp_processor_id();
#ifdef CONFIG_HOTPLUG_CPU
int hotplug = 1;
#else
int hotplug = 0;
#endif
if (hotplug || vector == NMI_VECTOR) {
- cpumask_t allbutme = cpu_online_map;
+ if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+ unsigned long mask = cpumask_bits(cpu_online_mask)[0];
- cpu_clear(smp_processor_id(), allbutme);
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
- if (!cpus_empty(allbutme))
- flat_send_IPI_mask(allbutme, vector);
+ _flat_send_IPI_mask(mask, vector);
+ }
} else if (num_online_cpus() > 1) {
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
+ vector, apic->dest_logical);
}
}
static void flat_send_IPI_all(int vector)
{
- if (vector == NMI_VECTOR)
- flat_send_IPI_mask(cpu_online_map, vector);
- else
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+ if (vector == NMI_VECTOR) {
+ flat_send_IPI_mask(cpu_online_mask, vector);
+ } else {
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC,
+ vector, apic->dest_logical);
+ }
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int flat_get_apic_id(unsigned long x)
{
unsigned int id;
id = (((x)>>24) & 0xFFu);
+
return id;
}
@@ -126,7 +150,7 @@ static unsigned int read_xapic_id(void)
{
unsigned int id;
- id = get_apic_id(apic_read(APIC_ID));
+ id = flat_get_apic_id(apic_read(APIC_ID));
return id;
}
@@ -135,34 +159,76 @@ static int flat_apic_id_registered(void)
return physid_isset(read_xapic_id(), phys_cpu_present_map);
}
-static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+/* Return the first word of @cpumask masked with APIC_ALL_CPUS. */
+static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
{
- return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+ unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+ unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
+
+ return mask1 & mask2;
}
-static unsigned int phys_pkg_id(int index_msb)
+static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
return hard_smp_processor_id() >> index_msb;
}
struct genapic apic_flat = {
- .name = "flat",
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .int_delivery_mode = dest_LowestPrio,
- .int_dest_mode = (APIC_DEST_LOGICAL != 0),
- .target_cpus = flat_target_cpus,
- .vector_allocation_domain = flat_vector_allocation_domain,
- .apic_id_registered = flat_apic_id_registered,
- .init_apic_ldr = flat_init_apic_ldr,
- .send_IPI_all = flat_send_IPI_all,
- .send_IPI_allbutself = flat_send_IPI_allbutself,
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_self = apic_send_IPI_self,
- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFu<<24),
+ .name = "flat",
+ .probe = NULL,
+ .acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = flat_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = flat_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = flat_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFu << 24,
+
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = flat_send_IPI_allbutself,
+ .send_IPI_all = flat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = NULL,
};
/*
@@ -188,35 +254,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static cpumask_t physflat_target_cpus(void)
+static const struct cpumask *physflat_target_cpus(void)
{
- return cpu_online_map;
+ return cpu_online_mask;
}
-static cpumask_t physflat_vector_allocation_domain(int cpu)
+static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- return cpumask_of_cpu(cpu);
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
-static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
- send_IPI_mask_sequence(cpumask, vector);
+ default_send_IPI_mask_sequence_phys(cpumask, vector);
}
-static void physflat_send_IPI_allbutself(int vector)
+static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+ int vector)
{
- cpumask_t allbutme = cpu_online_map;
+ default_send_IPI_mask_allbutself_phys(cpumask, vector);
+}
- cpu_clear(smp_processor_id(), allbutme);
- physflat_send_IPI_mask(allbutme, vector);
+static void physflat_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
}
static void physflat_send_IPI_all(int vector)
{
- physflat_send_IPI_mask(cpu_online_map, vector);
+ physflat_send_IPI_mask(cpu_online_mask, vector);
}
-static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
int cpu;
@@ -224,29 +294,84 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
+ cpu = cpumask_first(cpumask);
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
+static unsigned int
+physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+
+ return BAD_APICID;
+}
+
struct genapic apic_physflat = {
- .name = "physical flat",
- .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = physflat_target_cpus,
- .vector_allocation_domain = physflat_vector_allocation_domain,
- .apic_id_registered = flat_apic_id_registered,
- .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
- .send_IPI_all = physflat_send_IPI_all,
- .send_IPI_allbutself = physflat_send_IPI_allbutself,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_self = apic_send_IPI_self,
- .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFu<<24),
+
+ .name = "physical flat",
+ .probe = NULL,
+ .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = physflat_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = physflat_vector_allocation_domain,
+ /* not needed, but shouldn't hurt: */
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = flat_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFu << 24,
+
+ .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = physflat_send_IPI_mask,
+ .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = NULL,
};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index f6a2c8eb48a..7c87156b641 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -22,23 +22,22 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
/*
* for now each logical cpu is in its own vector allocation domain.
*/
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
- unsigned int dest)
+static void
+ __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
unsigned long cfg;
@@ -56,32 +55,58 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
* at once. We have 16 cpu's in a cluster. This will minimize IPI register
* writes.
*/
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
+ unsigned long query_cpu;
unsigned long flags;
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned long this_cpu = smp_processor_id();
unsigned long query_cpu;
+ unsigned long flags;
local_irq_save(flags);
- for_each_cpu_mask(query_cpu, mask) {
- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
}
local_irq_restore(flags);
}
static void x2apic_send_IPI_allbutself(int vector)
{
- cpumask_t mask = cpu_online_map;
-
- cpu_clear(smp_processor_id(), mask);
+ unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
- if (!cpus_empty(mask))
- x2apic_send_IPI_mask(mask, vector);
+ local_irq_save(flags);
+ for_each_online_cpu(query_cpu) {
+ if (query_cpu == this_cpu)
+ continue;
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_map, vector);
+ x2apic_send_IPI_mask(cpu_online_mask, vector);
}
static int x2apic_apic_id_registered(void)
@@ -89,22 +114,42 @@ static int x2apic_apic_id_registered(void)
return 1;
}
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- int cpu;
-
/*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * We're using fixed IRQ delivery, can only return one logical APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
- if ((unsigned)cpu < NR_CPUS)
+ int cpu = cpumask_first(cpumask);
+
+ if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_logical_apicid, cpu);
else
return BAD_APICID;
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one logical APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ return BAD_APICID;
+}
+
+static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
{
unsigned int id;
@@ -120,7 +165,7 @@ static unsigned long set_apic_id(unsigned int id)
return x;
}
-static unsigned int phys_pkg_id(int index_msb)
+static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
{
return current_cpu_data.initial_apicid >> index_msb;
}
@@ -135,25 +180,58 @@ static void init_x2apic_ldr(void)
int cpu = smp_processor_id();
per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
- return;
}
struct genapic apic_x2apic_cluster = {
- .name = "cluster x2apic",
- .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .int_delivery_mode = dest_LowestPrio,
- .int_dest_mode = (APIC_DEST_LOGICAL != 0),
- .target_cpus = x2apic_target_cpus,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
- .apic_id_registered = x2apic_apic_id_registered,
- .init_apic_ldr = init_x2apic_ldr,
- .send_IPI_all = x2apic_send_IPI_all,
- .send_IPI_allbutself = x2apic_send_IPI_allbutself,
- .send_IPI_mask = x2apic_send_IPI_mask,
- .send_IPI_self = x2apic_send_IPI_self,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFFFFFFFu),
+
+ .name = "cluster x2apic",
+ .probe = NULL,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = x2apic_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = x2apic_cluster_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_cluster_phys_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = NULL,
};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index d042211768b..5cbae8aa040 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,55 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
x2apic_icr_write(cfg, apicid);
}
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- unsigned long flags;
unsigned long query_cpu;
+ unsigned long flags;
local_irq_save(flags);
- for_each_cpu_mask(query_cpu, mask) {
+ for_each_cpu(query_cpu, mask) {
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- cpumask_t mask = cpu_online_map;
+ unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu != this_cpu)
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
- cpu_clear(smp_processor_id(), mask);
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
- if (!cpus_empty(mask))
- x2apic_send_IPI_mask(mask, vector);
+ local_irq_save(flags);
+ for_each_online_cpu(query_cpu) {
+ if (query_cpu == this_cpu)
+ continue;
+ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_map, vector);
+ x2apic_send_IPI_mask(cpu_online_mask, vector);
}
static int x2apic_apic_id_registered(void)
@@ -87,68 +109,115 @@ static int x2apic_apic_id_registered(void)
return 1;
}
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- int cpu;
-
/*
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
- if ((unsigned)cpu < NR_CPUS)
+ int cpu = cpumask_first(cpumask);
+
+ if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
{
- unsigned int id;
+ int cpu;
- id = x;
- return id;
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+
+ return BAD_APICID;
}
-static unsigned long set_apic_id(unsigned int id)
+static unsigned int x2apic_phys_get_apic_id(unsigned long x)
{
- unsigned long x;
-
- x = id;
return x;
}
-static unsigned int phys_pkg_id(int index_msb)
+static unsigned long set_apic_id(unsigned int id)
+{
+ return id;
+}
+
+static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
{
return current_cpu_data.initial_apicid >> index_msb;
}
-void x2apic_send_IPI_self(int vector)
+static void x2apic_send_IPI_self(int vector)
{
apic_write(APIC_SELF_IPI, vector);
}
-void init_x2apic_ldr(void)
+static void init_x2apic_ldr(void)
{
- return;
}
struct genapic apic_x2apic_phys = {
- .name = "physical x2apic",
- .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = x2apic_target_cpus,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
- .apic_id_registered = x2apic_apic_id_registered,
- .init_apic_ldr = init_x2apic_ldr,
- .send_IPI_all = x2apic_send_IPI_all,
- .send_IPI_allbutself = x2apic_send_IPI_allbutself,
- .send_IPI_mask = x2apic_send_IPI_mask,
- .send_IPI_self = x2apic_send_IPI_self,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFFFFFFFu),
+
+ .name = "physical x2apic",
+ .probe = NULL,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = x2apic_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_phys_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = NULL,
};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2c7dbdb9827..6adb5e6f4d9 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/threads.h>
+#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -17,10 +18,14 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/hardirq.h>
+#include <linux/timer.h>
+#include <linux/proc_fs.h>
+#include <asm/current.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
#include <asm/pgtable.h>
+#include <asm/uv/uv.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/bios.h>
@@ -75,16 +80,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-static cpumask_t uv_target_cpus(void)
+static const struct cpumask *uv_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
-static cpumask_t uv_vector_allocation_domain(int cpu)
+static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -114,37 +118,49 @@ static void uv_send_IPI_one(int cpu, int vector)
int pnode;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
- lapicid = apicid & 0x3f; /* ZZZ macro needed */
+ lapicid = apicid & 0x3f; /* ZZZ macro needed */
pnode = uv_apicid_to_pnode(apicid);
- val =
- (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
- UVH_IPI_INT_APIC_ID_SHFT) |
- (vector << UVH_IPI_INT_VECTOR_SHFT);
+
+ val = ( 1UL << UVH_IPI_INT_SEND_SHFT ) |
+ ( lapicid << UVH_IPI_INT_APIC_ID_SHFT ) |
+ ( vector << UVH_IPI_INT_VECTOR_SHFT );
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
-static void uv_send_IPI_mask(cpumask_t mask, int vector)
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
{
unsigned int cpu;
- for_each_possible_cpu(cpu)
- if (cpu_isset(cpu, mask))
+ for_each_cpu(cpu, mask)
+ uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
uv_send_IPI_one(cpu, vector);
+ }
}
static void uv_send_IPI_allbutself(int vector)
{
- cpumask_t mask = cpu_online_map;
-
- cpu_clear(smp_processor_id(), mask);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
- if (!cpus_empty(mask))
- uv_send_IPI_mask(mask, vector);
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
+ }
}
static void uv_send_IPI_all(int vector)
{
- uv_send_IPI_mask(cpu_online_map, vector);
+ uv_send_IPI_mask(cpu_online_mask, vector);
}
static int uv_apic_id_registered(void)
@@ -156,22 +172,41 @@ static void uv_init_apic_ldr(void)
{
}
-static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- int cpu;
-
/*
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
+ int cpu = cpumask_first(cpumask);
+
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int
+uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+
+ return BAD_APICID;
+}
+
+static unsigned int x2apic_get_apic_id(unsigned long x)
{
unsigned int id;
@@ -193,10 +228,10 @@ static unsigned long set_apic_id(unsigned int id)
static unsigned int uv_read_apic_id(void)
{
- return get_apic_id(apic_read(APIC_ID));
+ return x2apic_get_apic_id(apic_read(APIC_ID));
}
-static unsigned int phys_pkg_id(int index_msb)
+static int uv_phys_pkg_id(int initial_apicid, int index_msb)
{
return uv_read_apic_id() >> index_msb;
}
@@ -207,23 +242,57 @@ static void uv_send_IPI_self(int vector)
}
struct genapic apic_x2apic_uv_x = {
- .name = "UV large system",
- .acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = uv_target_cpus,
- .vector_allocation_domain = uv_vector_allocation_domain,
- .apic_id_registered = uv_apic_id_registered,
- .init_apic_ldr = uv_init_apic_ldr,
- .send_IPI_all = uv_send_IPI_all,
- .send_IPI_allbutself = uv_send_IPI_allbutself,
- .send_IPI_mask = uv_send_IPI_mask,
- .send_IPI_self = uv_send_IPI_self,
- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFFFFFFFu),
+
+ .name = "UV large system",
+ .probe = NULL,
+ .acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_registered = uv_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = uv_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = uv_vector_allocation_domain,
+ .init_apic_ldr = uv_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = uv_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = uv_send_IPI_allbutself,
+ .send_IPI_all = uv_send_IPI_all,
+ .send_IPI_self = uv_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = NULL,
};
static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -356,6 +425,103 @@ static __init void uv_rtc_init(void)
}
/*
+ * percpu heartbeat timer
+ */
+static void uv_heartbeat(unsigned long ignored)
+{
+ struct timer_list *timer = &uv_hub_info->scir.timer;
+ unsigned char bits = uv_hub_info->scir.state;
+
+ /* flip heartbeat bit */
+ bits ^= SCIR_CPU_HEARTBEAT;
+
+ /* is this cpu idle? */
+ if (idle_cpu(raw_smp_processor_id()))
+ bits &= ~SCIR_CPU_ACTIVITY;
+ else
+ bits |= SCIR_CPU_ACTIVITY;
+
+ /* update system controller interface reg */
+ uv_set_scir_bits(bits);
+
+ /* enable next timer period */
+ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+}
+
+static void __cpuinit uv_heartbeat_enable(int cpu)
+{
+ if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+
+ uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+ setup_timer(timer, uv_heartbeat, cpu);
+ timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+ add_timer_on(timer, cpu);
+ uv_cpu_hub_info(cpu)->scir.enabled = 1;
+ }
+
+ /* check boot cpu */
+ if (!uv_cpu_hub_info(0)->scir.enabled)
+ uv_heartbeat_enable(0);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __cpuinit uv_heartbeat_disable(int cpu)
+{
+ if (uv_cpu_hub_info(cpu)->scir.enabled) {
+ uv_cpu_hub_info(cpu)->scir.enabled = 0;
+ del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ }
+ uv_set_cpu_scir_bits(cpu, 0xff);
+}
+
+/*
+ * cpu hotplug notifier
+ */
+static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ uv_heartbeat_enable(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ uv_heartbeat_disable(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+ hotcpu_notifier(uv_scir_cpu_notify, 0);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+}
+
+static __init int uv_init_heartbeat(void)
+{
+ int cpu;
+
+ if (is_uv_system())
+ for_each_online_cpu(cpu)
+ uv_heartbeat_enable(cpu);
+ return 0;
+}
+
+late_initcall(uv_init_heartbeat);
+
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+/*
* Called on each cpu to initialize the per_cpu UV data area.
* ZZZ hotplug not supported yet
*/
@@ -428,7 +594,7 @@ void __init uv_system_init(void)
uv_bios_init();
uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
- &uv_coherency_id, &uv_region_size);
+ &sn_coherency_id, &sn_region_size);
uv_rtc_init();
for_each_present_cpu(cpu) {
@@ -439,8 +605,7 @@ void __init uv_system_init(void)
uv_blade_info[blade].nr_possible_cpus++;
uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
- uv_cpu_hub_info(cpu)->lowmem_remap_top =
- lowmem_redir_base + lowmem_redir_size;
+ uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
uv_cpu_hub_info(cpu)->m_val = m_val;
uv_cpu_hub_info(cpu)->n_val = m_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@ -450,7 +615,8 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
+ uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
+ uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
@@ -467,4 +633,6 @@ void __init uv_system_init(void)
map_mmioh_high(max_pnode);
uv_cpu_init();
+ uv_scir_register_cpu_notifier();
+ proc_mkdir("sgi_uv", NULL);
}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 1dcb0f13897..3e66bd364a9 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,7 +35,6 @@ void __init reserve_ebda_region(void)
/* start of EBDA area */
ebda_addr = get_bios_ebda();
- printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
/* Fixup: bios puts an EBDA in the top 64K segment */
/* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa1d25dd83e..ac108d1fe18 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -12,9 +12,12 @@
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+#include <asm/trampoline.h>
void __init i386_start_kernel(void)
{
+ reserve_trampoline_memory();
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f9064..f5b27224769 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,27 +24,7 @@
#include <asm/kdebug.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
-
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
+#include <asm/trampoline.h>
static void __init zap_identity_mappings(void)
{
@@ -111,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");
- x86_64_init_pda();
-
x86_64_start_reservations(real_mode_data);
}
@@ -120,6 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
+ reserve_trampoline_memory();
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70..2a0aad7718d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,6 +19,7 @@
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -429,14 +430,34 @@ is386: movl $2,%ecx # set MP
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
- movl %eax,%fs # gets reset once there's real percpu
movl $(__USER_DS),%eax # DS/ES contains default USER segment
movl %eax,%ds
movl %eax,%es
- xorl %eax,%eax # Clear GS and LDT
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * The linker can't handle this by relocation. Manually set
+ * base address in stack canary segment descriptor.
+ */
+ cmpb $0,ready
+ jne 1f
+ movl $per_cpu__gdt_page,%eax
+ movl $per_cpu__stack_canary,%ecx
+ subl $20, %ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+1:
+#endif
+ movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
+
+ xorl %eax,%eax # Clear LDT
lldt %ax
cld # gcc2 wants the direction flag cleared at all times
@@ -446,8 +467,6 @@ is386: movl $2,%ecx # set MP
movb $1, ready
cmpb $0,%cl # the first CPU calls start_kernel
je 1f
- movl $(__KERNEL_PERCPU), %eax
- movl %eax,%fs # set this cpu's percpu
movl (stack_start), %esp
1:
#endif /* CONFIG_SMP */
@@ -548,12 +567,8 @@ early_fault:
pushl %eax
pushl %edx /* trapno */
pushl $fault_msg
-#ifdef CONFIG_EARLY_PRINTK
- call early_printk
-#else
call printk
#endif
-#endif
call dump_stack
hlt_loop:
hlt
@@ -580,11 +595,10 @@ ignore_int:
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
-#ifdef CONFIG_EARLY_PRINTK
- call early_printk
-#else
call printk
-#endif
+
+ call dump_stack
+
addl $(5*4),%esp
popl %ds
popl %es
@@ -660,7 +674,7 @@ early_recursion_flag:
.long 0
int_msg:
- .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+ .asciz "Unknown interrupt or fault at: %p %p %p\n"
fault_msg:
/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 26cfdc1d7c7..2e648e3a5ea 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)
movl %eax,%fs
movl %eax,%gs
- /*
- * Setup up a dummy PDA. this is just for some early bootup code
- * that does in_interrupt()
- */
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
movl $MSR_GS_BASE,%ecx
- movq $empty_zero_page,%rax
+ movq initial_gs(%rip),%rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)
.align 8
ENTRY(initial_code)
.quad x86_64_start_kernel
+ ENTRY(initial_gs)
+ .quad INIT_PER_CPU_VAR(irq_stack_union)
__FINITDATA
ENTRY(stack_start)
@@ -305,7 +311,7 @@ ENTRY(early_idt_handler)
call dump_stack
#ifdef CONFIG_KALLSYMS
leaq early_idt_ripmsg(%rip),%rdi
- movq 8(%rsp),%rsi # get rip again
+ movq 0(%rsp),%rsi # get rip again
call __print_symbol
#endif
#endif /* EARLY_PRINTK */
@@ -401,7 +407,8 @@ NEXT_PAGE(level2_spare_pgt)
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
- .quad per_cpu__gdt_page
+early_gdt_descr_base:
+ .quad INIT_PER_CPU_VAR(gdt_page)
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 067d8de913f..388254f69a2 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,7 +33,9 @@
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
unsigned long hpet_address;
-unsigned long hpet_num_timers;
+#ifdef CONFIG_PCI_MSI
+static unsigned long hpet_num_timers;
+#endif
static void __iomem *hpet_virt_address;
struct hpet_dev {
@@ -246,7 +248,7 @@ static void hpet_legacy_clockevent_register(void)
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
- hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(&hpet_clockevent);
global_clock_event = &hpet_clockevent;
printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -301,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
hpet_setup_msi_irq(hdev->irq);
disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+ irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
enable_irq(hdev->irq);
}
break;
@@ -449,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
return -1;
disable_irq(dev->irq);
- irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+ irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
enable_irq(dev->irq);
printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -500,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
/* 5 usec minimum reprogramming delta. */
evt->min_delta_ns = 5000;
- evt->cpumask = cpumask_of_cpu(hdev->cpu);
+ evt->cpumask = cpumask_of(hdev->cpu);
clockevents_register_device(evt);
}
@@ -626,11 +628,12 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_ONLINE:
- INIT_DELAYED_WORK(&work.work, hpet_work);
+ INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
init_completion(&work.complete);
/* FIXME: add schedule_work_on() */
schedule_delayed_work_on(cpu, &work.work, 0);
wait_for_completion(&work.complete);
+ destroy_timer_on_stack(&work.work.timer);
break;
case CPU_DEAD:
if (hdev) {
@@ -811,7 +814,7 @@ int __init hpet_enable(void)
out_nohpet:
hpet_clear_mapping();
- boot_hpet_disable = 1;
+ hpet_address = 0;
return 0;
}
@@ -834,10 +837,11 @@ static __init int hpet_late_init(void)
hpet_address = force_hpet_address;
hpet_enable();
- if (!hpet_virt_address)
- return -ENODEV;
}
+ if (!hpet_virt_address)
+ return -ENODEV;
+
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
for_each_online_cpu(cpu) {
@@ -893,7 +897,7 @@ static unsigned long hpet_rtc_flags;
static int hpet_prev_update_sec;
static struct rtc_time hpet_alarm_time;
static unsigned long hpet_pie_count;
-static unsigned long hpet_t1_cmp;
+static u32 hpet_t1_cmp;
static unsigned long hpet_default_delta;
static unsigned long hpet_pie_delta;
static unsigned long hpet_pie_limit;
@@ -901,6 +905,14 @@ static unsigned long hpet_pie_limit;
static rtc_irq_handler irq_handler;
/*
+ * Check that the hpet counter c1 is ahead of the c2
+ */
+static inline int hpet_cnt_ahead(u32 c1, u32 c2)
+{
+ return (s32)(c2 - c1) < 0;
+}
+
+/*
* Registers a IRQ handler.
*/
int hpet_register_irq_handler(rtc_irq_handler handler)
@@ -1071,7 +1083,7 @@ static void hpet_rtc_timer_reinit(void)
hpet_t1_cmp += delta;
hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
lost_ints++;
- } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
+ } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER)));
if (lost_ints) {
if (hpet_rtc_flags & RTC_PIE)
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 1f20608d4ca..b0f61f0dcd0 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -58,7 +58,7 @@ void __cpuinit mxcsr_feature_mask_init(void)
stts();
}
-void __init init_thread_xstate(void)
+void __cpuinit init_thread_xstate(void)
{
if (!HAVE_HWFP) {
xstate_size = sizeof(struct i387_soft_struct);
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index dbd6c1d1b63..b42ca694dc6 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -28,10 +28,10 @@ static int i8237A_resume(struct sys_device *dev)
flags = claim_dma_lock();
- dma_outb(DMA1_RESET_REG, 0);
- dma_outb(DMA2_RESET_REG, 0);
+ dma_outb(0, DMA1_RESET_REG);
+ dma_outb(0, DMA2_RESET_REG);
- for (i = 0;i < 8;i++) {
+ for (i = 0; i < 8; i++) {
set_dma_addr(i, 0x000000);
/* DMA count is a bit weird so this is not 0 */
set_dma_count(i, 1);
@@ -51,14 +51,14 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
}
static struct sysdev_class i8237_sysdev_class = {
- .name = "i8237",
- .suspend = i8237A_suspend,
- .resume = i8237A_resume,
+ .name = "i8237",
+ .suspend = i8237A_suspend,
+ .resume = i8237A_resume,
};
static struct sys_device device_i8237A = {
- .id = 0,
- .cls = &i8237_sysdev_class,
+ .id = 0,
+ .cls = &i8237_sysdev_class,
};
static int __init i8237A_init_sysfs(void)
@@ -68,5 +68,4 @@ static int __init i8237A_init_sysfs(void)
error = sysdev_register(&device_i8237A);
return error;
}
-
device_initcall(i8237A_init_sysfs);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f..10f92fb532f 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
* Start pit with the boot cpu mask and make it global after the
* IO_APIC has been initialized.
*/
- pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ pit_clockevent.cpumask = cpumask_of(smp_processor_id());
pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
pit_clockevent.shift);
pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 4b8a53d841f..11d5093eb28 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -11,15 +11,15 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
-#include <asm/acpi.h>
#include <asm/atomic.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
-#include <asm/delay.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/arch_hooks.h>
@@ -323,7 +323,7 @@ void init_8259A(int auto_eoi)
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
- to 0x20-0x27 on i386 */
+ to 0x20-0x27 on i386 */
outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
/* 8259A-1 (the master) has a slave on IR2 */
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4120c..df3bf269bea 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,11 +10,9 @@
#include <asm/pgtable.h>
#include <asm/desc.h>
-static struct fs_struct init_fs = INIT_FS;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
-EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
/*
* Initial thread structure.
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 7a3f2028e2e..7248ca11bdc 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -1,7 +1,7 @@
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
*
* Many thanks to Stig Venaas for trying out countless experimental
* patches and reporting/debugging problems patiently!
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/acpi.h>
@@ -61,9 +62,7 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
-#include <mach_ipi.h>
-#include <mach_apic.h>
-#include <mach_apicdef.h>
+#include <asm/genapic.h>
#define __apicdebuginit(type) static type __init
@@ -82,11 +81,11 @@ static DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
@@ -99,103 +98,293 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
int skip_ioapic_setup;
+void arch_disable_smp_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
static int __init parse_noapic(char *str)
{
/* disable IO-APIC */
- disable_ioapic_setup();
+ arch_disable_smp_support();
return 0;
}
early_param("noapic", parse_noapic);
struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+ struct irq_pin_list *pin;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+
+ return pin;
+}
+
struct irq_cfg {
- unsigned int irq;
struct irq_pin_list *irq_2_pin;
- cpumask_t domain;
- cpumask_t old_domain;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ u8 move_desc_pending : 1;
+#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
static struct irq_cfg irq_cfgx[NR_IRQS] = {
- [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
- [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
- [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
- [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
- [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
- [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
- [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
- [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
- [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
- [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
- [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
- [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
- [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
- [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
- [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
- [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+ [0] = { .vector = IRQ0_VECTOR, },
+ [1] = { .vector = IRQ1_VECTOR, },
+ [2] = { .vector = IRQ2_VECTOR, },
+ [3] = { .vector = IRQ3_VECTOR, },
+ [4] = { .vector = IRQ4_VECTOR, },
+ [5] = { .vector = IRQ5_VECTOR, },
+ [6] = { .vector = IRQ6_VECTOR, },
+ [7] = { .vector = IRQ7_VECTOR, },
+ [8] = { .vector = IRQ8_VECTOR, },
+ [9] = { .vector = IRQ9_VECTOR, },
+ [10] = { .vector = IRQ10_VECTOR, },
+ [11] = { .vector = IRQ11_VECTOR, },
+ [12] = { .vector = IRQ12_VECTOR, },
+ [13] = { .vector = IRQ13_VECTOR, },
+ [14] = { .vector = IRQ14_VECTOR, },
+ [15] = { .vector = IRQ15_VECTOR, },
};
-#define for_each_irq_cfg(irq, cfg) \
- for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+int __init arch_early_irq_init(void)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ int count;
+ int i;
+
+ cfg = irq_cfgx;
+ count = ARRAY_SIZE(irq_cfgx);
+ for (i = 0; i < count; i++) {
+ desc = irq_to_desc(i);
+ desc->chip_data = &cfg[i];
+ alloc_bootmem_cpumask_var(&cfg[i].domain);
+ alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ if (i < NR_IRQS_LEGACY)
+ cpumask_setall(cfg[i].domain);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg *irq_cfg(unsigned int irq)
{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ struct irq_cfg *cfg = NULL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ if (desc)
+ cfg = desc->chip_data;
+
+ return cfg;
}
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
{
- return irq_cfg(irq);
+ struct irq_cfg *cfg;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+ if (cfg) {
+ if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
+ kfree(cfg);
+ cfg = NULL;
+ } else if (!alloc_cpumask_var_node(&cfg->old_domain,
+ GFP_ATOMIC, node)) {
+ free_cpumask_var(cfg->domain);
+ kfree(cfg);
+ cfg = NULL;
+ } else {
+ cpumask_clear(cfg->domain);
+ cpumask_clear(cfg->old_domain);
+ }
+ }
+
+ return cfg;
}
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+ struct irq_cfg *cfg;
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+ cfg = desc->chip_data;
+ if (!cfg) {
+ desc->chip_data = get_one_free_irq_cfg(cpu);
+ if (!desc->chip_data) {
+ printk(KERN_ERR "can not alloc irq_cfg\n");
+ BUG_ON(1);
+ }
+ }
-struct irq_pin_list {
- int apic, pin;
- struct irq_pin_list *next;
-};
+ return 0;
+}
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static void __init irq_2_pin_init(void)
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
{
- struct irq_pin_list *pin = irq_2_pin_head;
- int i;
+ struct irq_pin_list *old_entry, *head, *tail, *entry;
- for (i = 1; i < PIN_MAP_SIZE; i++)
- pin[i-1].next = &pin[i];
+ cfg->irq_2_pin = NULL;
+ old_entry = old_cfg->irq_2_pin;
+ if (!old_entry)
+ return;
- irq_2_pin_ptr = &pin[0];
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry)
+ return;
+
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ head = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ while (old_entry) {
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ entry = head;
+ while (entry) {
+ head = entry->next;
+ kfree(entry);
+ entry = head;
+ }
+ /* still use the old one */
+ return;
+ }
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ tail->next = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ }
+
+ tail->next = NULL;
+ cfg->irq_2_pin = head;
}
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
{
- struct irq_pin_list *pin = irq_2_pin_ptr;
+ struct irq_pin_list *entry, *next;
- if (!pin)
- panic("can not get more irq_2_pin\n");
+ if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+ return;
- irq_2_pin_ptr = pin->next;
- pin->next = NULL;
- return pin;
+ entry = old_cfg->irq_2_pin;
+
+ while (entry) {
+ next = entry->next;
+ kfree(entry);
+ entry = next;
+ }
+ old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+ struct irq_desc *desc, int cpu)
+{
+ struct irq_cfg *cfg;
+ struct irq_cfg *old_cfg;
+
+ cfg = get_one_free_irq_cfg(cpu);
+
+ if (!cfg)
+ return;
+
+ desc->chip_data = cfg;
+
+ old_cfg = old_desc->chip_data;
+
+ memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+ init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+ kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+ struct irq_cfg *old_cfg, *cfg;
+
+ old_cfg = old_desc->chip_data;
+ cfg = desc->chip_data;
+
+ if (old_cfg == cfg)
+ return;
+
+ if (old_cfg) {
+ free_irq_2_pin(old_cfg, cfg);
+ free_irq_cfg(old_cfg);
+ old_desc->chip_data = NULL;
+ }
}
+static void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg = desc->chip_data;
+
+ if (!cfg->move_in_progress) {
+ /* it means that domain is not changed */
+ if (!cpumask_intersects(desc->affinity, mask))
+ cfg->move_desc_pending = 1;
+ }
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
+static inline void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+}
+#endif
+
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -205,7 +394,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -237,11 +426,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
unsigned long flags;
- struct irq_cfg *cfg = irq_cfg(irq);
spin_lock_irqsave(&ioapic_lock, flags);
entry = cfg->irq_2_pin;
@@ -298,7 +486,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
@@ -323,13 +511,32 @@ static void ioapic_mask_entry(int apic, int pin)
}
#ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
{
int apic, pin;
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
- cfg = irq_cfg(irq);
entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
@@ -359,36 +566,63 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
}
}
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_and(desc->affinity, cfg->domain, mask);
+ set_extra_move_desc(desc, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
+}
+
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
unsigned long flags;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
+ unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
+ irq = desc->irq;
+ cfg = desc->chip_data;
- cfg = irq_cfg(irq);
- if (assign_irq_vector(irq, mask))
- return;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
- /*
- * Only the high 8 bits are valid.
- */
- dest = SET_APIC_LOGICAL_ID(dest);
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc;
desc = irq_to_desc(irq);
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
- desc->affinity = mask;
- spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ set_ioapic_affinity_irq_desc(desc, mask);
}
#endif /* CONFIG_SMP */
@@ -397,16 +631,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
{
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
- /* first time to refer irq_cfg, so with new */
- cfg = irq_cfg_alloc(irq);
entry = cfg->irq_2_pin;
if (!entry) {
- entry = get_one_free_irq_2_pin();
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+ apic, pin);
+ return;
+ }
cfg->irq_2_pin = entry;
entry->apic = apic;
entry->pin = pin;
@@ -421,7 +657,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry = entry->next;
}
- entry->next = get_one_free_irq_2_pin();
+ entry->next = get_one_free_irq_2_pin(cpu);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -430,11 +666,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
int oldapic, int oldpin,
int newapic, int newpin)
{
- struct irq_cfg *cfg = irq_cfg(irq);
struct irq_pin_list *entry = cfg->irq_2_pin;
int replaced = 0;
@@ -451,18 +686,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
/* why? call replace before add? */
if (!replaced)
- add_pin_to_irq(irq, newapic, newpin);
+ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
}
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
int pin;
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
- cfg = irq_cfg(irq);
for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
unsigned int reg;
pin = entry->pin;
@@ -475,13 +708,13 @@ static inline void io_apic_modify_irq(unsigned int irq,
}
}
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
#ifdef CONFIG_X86_64
-void io_apic_sync(struct irq_pin_list *entry)
+static void io_apic_sync(struct irq_pin_list *entry)
{
/*
* Synchronize the IO-APIC and the CPU by doing
@@ -492,47 +725,64 @@ void io_apic_sync(struct irq_pin_list *entry)
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
}
#else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
}
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
IO_APIC_REDIR_MASKED, NULL);
}
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
#endif /* CONFIG_X86_32 */
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
+ BUG_ON(!cfg);
+
spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
+ __mask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ unmask_IO_APIC_irq_desc(desc);
+}
+
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
@@ -556,23 +806,6 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
-#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP && CONFIG_X86_32*/
-
#ifdef CONFIG_X86_32
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -704,10 +937,10 @@ static int find_irq_entry(int apic, int pin, int type)
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
return i;
return -1;
@@ -721,13 +954,13 @@ static int __init find_isa_irq_pin(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
- return mp_irqs[i].mp_dstirq;
+ return mp_irqs[i].dstirq;
}
return -1;
}
@@ -737,17 +970,17 @@ static int __init find_isa_irq_apic(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
return apic;
}
}
@@ -772,23 +1005,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
+ !mp_irqs[i].irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
+ if (pin == (mp_irqs[i].srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -809,7 +1042,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -831,7 +1064,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -848,13 +1081,13 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mp_irqflag & 3)
+ switch (mp_irqs[idx].irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -890,13 +1123,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -974,16 +1207,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mp_dstirq != pin)
+ if (mp_irqs[idx].dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mp_srcbusirq;
+ irq = mp_irqs[idx].srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -1034,7 +1267,8 @@ void unlock_vector_lock(void)
spin_unlock(&vector_lock);
}
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -1049,52 +1283,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
unsigned int old_vector;
- int cpu;
- struct irq_cfg *cfg;
-
- cfg = irq_cfg(irq);
-
- /* Only try and allocate irqs on cpus that are present */
- cpus_and(mask, mask, cpu_online_map);
+ int cpu, err;
+ cpumask_var_t tmp_mask;
if ((cfg->move_in_progress) || cfg->move_cleanup_count)
return -EBUSY;
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
old_vector = cfg->vector;
if (old_vector) {
- cpumask_t tmp;
- cpus_and(tmp, cfg->domain, mask);
- if (!cpus_empty(tmp))
+ cpumask_and(tmp_mask, mask, cpu_online_mask);
+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ if (!cpumask_empty(tmp_mask)) {
+ free_cpumask_var(tmp_mask);
return 0;
+ }
}
- for_each_cpu_mask_nr(cpu, mask) {
- cpumask_t domain, new_mask;
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
int new_cpu;
int vector, offset;
- domain = vector_allocation_domain(cpu);
- cpus_and(new_mask, domain, cpu_online_map);
+ apic->vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
offset = current_offset;
next:
vector += 8;
if (vector >= first_system_vector) {
- /* If we run out of vectors on large boxen, must share them. */
+ /* If out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
}
if (unlikely(current_vector == vector))
continue;
-#ifdef CONFIG_X86_64
- if (vector == IA32_SYSCALL_VECTOR)
- goto next;
-#else
- if (vector == SYSCALL_VECTOR)
+
+ if (test_bit(vector, used_vectors))
goto next;
-#endif
- for_each_cpu_mask_nr(new_cpu, new_mask)
+
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
/* Found one! */
@@ -1102,44 +1333,56 @@ next:
current_offset = offset;
if (old_vector) {
cfg->move_in_progress = 1;
- cfg->old_domain = cfg->domain;
+ cpumask_copy(cfg->old_domain, cfg->domain);
}
- for_each_cpu_mask_nr(new_cpu, new_mask)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
- cfg->domain = domain;
- return 0;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
}
- return -ENOSPC;
+ free_cpumask_var(tmp_mask);
+ return err;
}
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, mask);
+ err = __assign_irq_vector(irq, cfg, mask);
spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
{
- struct irq_cfg *cfg;
- cpumask_t mask;
int cpu, vector;
- cfg = irq_cfg(irq);
BUG_ON(!cfg->vector);
vector = cfg->vector;
- cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask_nr(cpu, mask)
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
- cpus_clear(cfg->domain);
+ cpumask_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress))
+ return;
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+ continue;
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ break;
+ }
+ }
+ cfg->move_in_progress = 0;
}
void __setup_vector_irq(int cpu)
@@ -1148,10 +1391,12 @@ void __setup_vector_irq(int cpu)
/* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
+ struct irq_desc *desc;
/* Mark the inuse vectors */
- for_each_irq_cfg(irq, cfg) {
- if (!cpu_isset(cpu, cfg->domain))
+ for_each_irq_desc(irq, desc) {
+ cfg = desc->chip_data;
+ if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1163,7 +1408,7 @@ void __setup_vector_irq(int cpu)
continue;
cfg = irq_cfg(irq);
- if (!cpu_isset(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
@@ -1201,11 +1446,8 @@ static inline int IO_APIC_irq_trigger(int irq)
}
#endif
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
@@ -1236,10 +1478,10 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
handle_edge_irq, "edge");
}
-static int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector)
+int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector)
{
/*
* add it to the IO-APIC irq-routing table:
@@ -1248,25 +1490,25 @@ static int setup_ioapic_entry(int apic, int irq,
#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+ struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
struct irte irte;
struct IR_IO_APIC_route_entry *ir_entry =
(struct IR_IO_APIC_route_entry *) entry;
int index;
if (!iommu)
- panic("No mapping iommu for ioapic %d\n", apic);
+ panic("No mapping iommu for ioapic %d\n", apic_id);
index = alloc_irte(iommu, irq, 1);
if (index < 0)
- panic("Failed to allocate IRTE for ioapic %d\n", apic);
+ panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
memset(&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
irte.trigger_mode = trigger;
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = vector;
irte.dest_id = IRTE_DEST(destination);
@@ -1279,8 +1521,8 @@ static int setup_ioapic_entry(int apic, int irq,
} else
#endif
{
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->dest = destination;
}
@@ -1297,69 +1539,68 @@ static int setup_ioapic_entry(int apic, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
- cpumask_t mask;
+ unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
- cfg = irq_cfg(irq);
+ cfg = desc->chip_data;
- mask = TARGET_CPUS;
- if (assign_irq_vector(irq, mask))
+ if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
- cpus_and(mask, cfg->domain, mask);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
+ apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
irq, trigger, polarity);
- if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- cpu_mask_to_apicid(mask), trigger, polarity,
- cfg->vector)) {
+ if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+ dest, trigger, polarity, cfg->vector)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic].mp_apicid, pin);
- __clear_irq_vector(irq);
+ mp_ioapics[apic_id].apicid, pin);
+ __clear_irq_vector(irq, cfg);
return;
}
- ioapic_register_intr(irq, trigger);
- if (irq < 16)
+ ioapic_register_intr(irq, desc, trigger);
+ if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
if (!notcon) {
notcon = 1;
apic_printk(APIC_VERBOSE,
KERN_DEBUG " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
} else
apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
continue;
}
if (notcon) {
@@ -1368,14 +1609,25 @@ static void __init setup_IO_APIC_irqs(void)
notcon = 0;
}
- irq = pin_2_irq(idx, apic, pin);
-#ifdef CONFIG_X86_32
- if (multi_timer_check(apic, irq))
+ irq = pin_2_irq(idx, apic_id, pin);
+
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
continue;
-#endif
- add_pin_to_irq(irq, apic, pin);
- setup_IO_APIC_irq(apic, pin, irq,
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
+ }
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
irq_trigger(idx), irq_polarity(idx));
}
}
@@ -1388,7 +1640,7 @@ static void __init setup_IO_APIC_irqs(void)
/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
int vector)
{
struct IO_APIC_route_entry entry;
@@ -1404,10 +1656,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = apic->irq_dest_mode;
+ entry.mask = 0; /* don't mask IRQ for edge */
+ entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+ entry.delivery_mode = apic->irq_delivery_mode;
entry.polarity = 0;
entry.trigger = 0;
entry.vector = vector;
@@ -1421,7 +1673,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
/*
* Add it to the IO-APIC irq-routing table:
*/
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
@@ -1434,6 +1686,7 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
+ struct irq_desc *desc;
unsigned int irq;
if (apic_verbosity == APIC_QUIET)
@@ -1442,7 +1695,7 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1462,7 +1715,7 @@ __apicdebuginit(void) print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1523,8 +1776,11 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_cfg(irq, cfg) {
- struct irq_pin_list *entry = cfg->irq_2_pin;
+ for_each_irq_desc(irq, desc) {
+ struct irq_pin_list *entry;
+
+ cfg = desc->chip_data;
+ entry = cfg->irq_2_pin;
if (!entry)
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
@@ -1830,7 +2086,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
- int apic;
+ int apic_id;
int i;
unsigned char old_id;
unsigned long flags;
@@ -1849,26 +2105,26 @@ static void __init setup_ioapic_ids_from_mpc(void)
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+ phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic].mp_apicid;
+ old_id = mp_ioapics[apic_id].apicid;
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ mp_ioapics[apic_id].apicid = reg_00.bits.ID;
}
/*
@@ -1876,10 +2132,10 @@ static void __init setup_ioapic_ids_from_mpc(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
+ if (apic->check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic_id].apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1888,13 +2144,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
+ mp_ioapics[apic_id].apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1903,11 +2159,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mp_apicid)
+ if (old_id != mp_ioapics[apic_id].apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mp_ioapics[apic_id].apicid;
/*
* Read the right value from the MPC table and
@@ -1915,20 +2171,20 @@ static void __init setup_ioapic_ids_from_mpc(void)
*/
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ reg_00.bits.ID = mp_ioapics[apic_id].apicid;
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
+ io_apic_write(apic_id, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2008,14 +2264,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
{
int was_pending = 0;
unsigned long flags;
+ struct irq_cfg *cfg;
spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
was_pending = 1;
}
- __unmask_IO_APIC_irq(irq);
+ cfg = irq_cfg(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
@@ -2029,7 +2287,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+ apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2037,7 +2295,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
#else
static int ioapic_retrigger_irq(unsigned int irq)
{
- send_IPI_self(irq_cfg(irq)->vector);
+ apic->send_IPI_self(irq_cfg(irq)->vector);
return 1;
}
@@ -2078,35 +2336,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
* as simple as edge triggered migration and we can do the irq migration
* with a simple atomic update to IO-APIC RTE.
*/
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void
+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
int modify_ioapic_rte;
unsigned int dest;
unsigned long flags;
+ unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ if (!cpumask_intersects(mask, cpu_online_mask))
return;
+ irq = desc->irq;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ set_extra_move_desc(desc, mask);
+
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
- desc = irq_to_desc(irq);
modify_ioapic_rte = desc->status & IRQ_LEVEL;
if (modify_ioapic_rte) {
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
+ __target_IO_APIC_irq(irq, dest, cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -2118,24 +2376,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
*/
modify_irte(irq, &irte);
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
- desc->affinity = mask;
+ cpumask_copy(desc->affinity, mask);
}
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
{
int ret = -1;
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
- if (io_apic_level_ack_pending(irq)) {
+ if (io_apic_level_ack_pending(cfg)) {
/*
* Interrupt in progress. Migrating irq now will change the
* vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2401,15 @@ static int migrate_irq_remapped_level(int irq)
}
/* everthing is clear. we have right of way */
- migrate_ioapic_irq(irq, desc->pending_mask);
+ migrate_ioapic_irq_desc(desc, desc->pending_mask);
ret = 0;
desc->status &= ~IRQ_MOVE_PENDING;
- cpus_clear(desc->pending_mask);
+ cpumask_clear(desc->pending_mask);
unmask:
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
+
return ret;
}
@@ -2184,28 +2439,33 @@ static void ir_irq_migration(struct work_struct *work)
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
if (desc->status & IRQ_LEVEL) {
desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = mask;
- migrate_irq_remapped_level(irq);
+ cpumask_copy(desc->pending_mask, mask);
+ migrate_irq_remapped_level_desc(desc);
return;
}
- migrate_ioapic_irq(irq, mask);
+ migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+ const struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
+
ack_APIC_irq();
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
me = smp_processor_id();
@@ -2215,6 +2475,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
+ if (irq == -1)
+ continue;
+
desc = irq_to_desc(irq);
if (!desc)
continue;
@@ -2224,7 +2487,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
if (!cfg->move_cleanup_count)
goto unlock;
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
__get_cpu_var(vector_irq)[vector] = -1;
@@ -2236,28 +2499,45 @@ unlock:
irq_exit();
}
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
{
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_desc *desc = *descp;
+ struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress))
+ if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ if (likely(!cfg->move_desc_pending))
+ return;
+
+ /* domain has not changed, but affinity did */
+ me = smp_processor_id();
+ if (cpumask_test_cpu(me, desc->affinity)) {
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+ cfg->move_desc_pending = 0;
+ }
+#endif
return;
+ }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
- cpumask_t cleanup_mask;
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+#endif
+ send_cleanup_vector(cfg);
}
}
#else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
+
#ifdef CONFIG_INTR_REMAP
static void ack_x2apic_level(unsigned int irq)
{
@@ -2268,11 +2548,14 @@ static void ack_x2apic_edge(unsigned int irq)
{
ack_x2APIC_irq();
}
+
#endif
static void ack_apic_edge(unsigned int irq)
{
- irq_complete_move(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ irq_complete_move(&desc);
move_native_irq(irq);
ack_APIC_irq();
}
@@ -2281,18 +2564,21 @@ atomic_t irq_mis_count;
static void ack_apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
+
#ifdef CONFIG_X86_32
unsigned long v;
int i;
#endif
+ struct irq_cfg *cfg;
int do_unmask_irq = 0;
- irq_complete_move(irq);
+ irq_complete_move(&desc);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+ if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
}
#endif
@@ -2316,7 +2602,8 @@ static void ack_apic_level(unsigned int irq)
* operation to prevent an edge-triggered interrupt escaping meanwhile.
* The idea is from Manfred Spraul. --macro
*/
- i = irq_cfg(irq)->vector;
+ cfg = desc->chip_data;
+ i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
#endif
@@ -2355,17 +2642,18 @@ static void ack_apic_level(unsigned int irq)
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- if (!io_apic_level_ack_pending(irq))
+ cfg = desc->chip_data;
+ if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
}
#ifdef CONFIG_X86_32
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(irq);
- __unmask_and_level_IO_APIC_irq(irq);
+ __mask_and_edge_IO_APIC_irq(cfg);
+ __unmask_and_level_IO_APIC_irq(cfg);
spin_unlock(&ioapic_lock);
}
#endif
@@ -2416,20 +2704,19 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for_each_irq_cfg(irq, cfg) {
- if (IO_APIC_IRQ(irq) && !cfg->vector) {
+ for_each_irq_desc(irq, desc) {
+ cfg = desc->chip_data;
+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < 16)
+ if (irq < NR_IRQS_LEGACY)
make_8259A_irq(irq);
- else {
- desc = irq_to_desc(irq);
+ else
/* Strange. Oh, well.. */
desc->chip = &no_irq_chip;
- }
}
}
}
@@ -2454,7 +2741,7 @@ static void unmask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
@@ -2466,11 +2753,8 @@ static struct irq_chip lapic_chip __read_mostly = {
.ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
desc->status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
@@ -2574,22 +2858,20 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg(0);
+ struct irq_desc *desc = irq_to_desc(0);
+ struct irq_cfg *cfg = desc->chip_data;
+ int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
- unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, TARGET_CPUS);
+ assign_irq_vector(0, cfg, apic->target_cpus());
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2603,7 +2885,13 @@ static inline void __init check_timer(void)
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
#ifdef CONFIG_X86_32
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ {
+ unsigned int ver;
+
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ }
#endif
pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2640,10 +2928,19 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
+ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ } else {
+ /* for edge trigger, setup_IO_APIC_irq already
+ * leave it unmasked.
+ * so only need to unmask if it is level-trigger
+ * do we really have level trigger timer?
+ */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_IO_APIC_irq_desc(desc);
}
- unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
@@ -2657,6 +2954,7 @@ static inline void __init check_timer(void)
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
#endif
+ local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2669,9 +2967,8 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq(0);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2686,6 +2983,7 @@ static inline void __init check_timer(void)
/*
* Cleanup, just in case ...
*/
+ local_irq_disable();
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -2703,7 +3001,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0);
+ lapic_register_intr(0, desc);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2711,6 +3009,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -2728,6 +3027,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
@@ -2828,8 +3128,8 @@ static int ioapic_resume(struct sys_device *dev)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2879,6 +3179,7 @@ static int __init ioapic_init_sysfs(void)
device_initcall(ioapic_init_sysfs);
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
@@ -2888,22 +3189,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
unsigned int irq;
unsigned int new;
unsigned long flags;
- struct irq_cfg *cfg_new;
-
- irq_want = nr_irqs - 1;
+ struct irq_cfg *cfg_new = NULL;
+ int cpu = boot_cpu_id;
+ struct irq_desc *desc_new = NULL;
irq = 0;
+ if (irq_want < nr_irqs_gsi)
+ irq_want = nr_irqs_gsi;
+
spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new > 0; new--) {
- if (platform_legacy_irq(new))
+ for (new = irq_want; new < nr_irqs; new++) {
+ desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ if (!desc_new) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", new);
continue;
- cfg_new = irq_cfg(new);
- if (cfg_new && cfg_new->vector != 0)
+ }
+ cfg_new = desc_new->chip_data;
+
+ if (cfg_new->vector != 0)
continue;
- /* check if need to create one */
- if (!cfg_new)
- cfg_new = irq_cfg_alloc(new);
- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
}
@@ -2911,15 +3216,20 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (irq > 0) {
dynamic_irq_init(irq);
+ /* restore it, in case dynamic_irq_init clear it */
+ if (desc_new)
+ desc_new->chip_data = cfg_new;
}
return irq;
}
int create_irq(void)
{
+ unsigned int irq_want;
int irq;
- irq = create_irq_nr(nr_irqs - 1);
+ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want);
if (irq == 0)
irq = -1;
@@ -2930,14 +3240,22 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ /* store it, in case dynamic_irq_cleanup clear it */
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
dynamic_irq_cleanup(irq);
+ /* connect back irq_cfg */
+ if (desc)
+ desc->chip_data = cfg;
#ifdef CONFIG_INTR_REMAP
free_irte(irq);
#endif
spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
}
@@ -2950,16 +3268,16 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
struct irq_cfg *cfg;
int err;
unsigned dest;
- cpumask_t tmp;
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ if (disable_apic)
+ return -ENXIO;
+
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (err)
return err;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
@@ -2973,9 +3291,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
memset (&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -2993,10 +3311,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
msg->address_hi = MSI_ADDR_BASE_HI;
msg->address_lo =
MSI_ADDR_BASE_LO |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_DEST_ID(dest);
@@ -3004,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(cfg->vector);
@@ -3013,64 +3331,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
}
#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
- if (assign_irq_vector(irq, mask))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
- read_msi_msg(irq, &msg);
+ read_msi_msg_desc(desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
+ write_msi_msg_desc(desc, &msg);
}
-
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void
+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
- struct irq_cfg *cfg;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
unsigned int dest;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
- struct irq_desc *desc;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
-
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3084,16 +3386,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
* at the new destination. So, time to cleanup the previous
* vector allocation.
*/
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
-
- desc = irq_to_desc(irq);
- desc->affinity = mask;
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
}
+
#endif
#endif /* CONFIG_SMP */
@@ -3152,7 +3448,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
}
#endif
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
int ret;
struct msi_msg msg;
@@ -3161,7 +3457,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
if (ret < 0)
return ret;
- set_irq_msi(irq, desc);
+ set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
#ifdef CONFIG_INTR_REMAP
@@ -3181,58 +3477,11 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
return 0;
}
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
- unsigned int irq;
-
- irq = dev->bus->number;
- irq <<= 8;
- irq |= dev->devfn;
- irq <<= 12;
-
- return irq;
-}
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
- unsigned int irq;
- int ret;
- unsigned int irq_want;
-
- irq_want = build_irq_for_pci_dev(dev) + 0x100;
-
- irq = create_irq_nr(irq_want);
- if (irq == 0)
- return -1;
-
-#ifdef CONFIG_INTR_REMAP
- if (!intr_remapping_enabled)
- goto no_ir;
-
- ret = msi_alloc_irte(dev, irq, 1);
- if (ret < 0)
- goto error;
-no_ir:
-#endif
- ret = setup_msi_irq(dev, desc, irq);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
- return 0;
-
-#ifdef CONFIG_INTR_REMAP
-error:
- destroy_irq(irq);
- return ret;
-#endif
-}
-
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
unsigned int irq;
int ret, sub_handle;
- struct msi_desc *desc;
+ struct msi_desc *msidesc;
unsigned int irq_want;
#ifdef CONFIG_INTR_REMAP
@@ -3240,12 +3489,13 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int index = 0;
#endif
- irq_want = build_irq_for_pci_dev(dev) + 0x100;
+ irq_want = nr_irqs_gsi;
sub_handle = 0;
- list_for_each_entry(desc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want--);
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = create_irq_nr(irq_want);
if (irq == 0)
return -1;
+ irq_want = irq + 1;
#ifdef CONFIG_INTR_REMAP
if (!intr_remapping_enabled)
goto no_ir;
@@ -3275,7 +3525,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
}
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -3294,24 +3544,18 @@ void arch_teardown_msi_irq(unsigned int irq)
#ifdef CONFIG_DMAR
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
dmar_msi_read(irq, &msg);
@@ -3321,9 +3565,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip dmar_msi_type = {
@@ -3355,24 +3598,18 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
- struct irq_desc *desc;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
hpet_msi_read(irq, &msg);
@@ -3382,9 +3619,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip hpet_msi_type = {
@@ -3437,28 +3673,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
target_ht_irq(irq, dest, cfg->vector);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif
static struct irq_chip ht_irq_chip = {
@@ -3476,17 +3705,18 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
struct irq_cfg *cfg;
int err;
- cpumask_t tmp;
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ if (disable_apic)
+ return -ENXIO;
+
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus());
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3494,11 +3724,11 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
HT_IRQ_LOW_BASE |
HT_IRQ_LOW_DEST_ID(dest) |
HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
HT_IRQ_LOW_DM_LOGICAL) |
HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
HT_IRQ_LOW_MT_FIXED :
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
@@ -3514,7 +3744,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
}
#endif /* CONFIG_HT_IRQ */
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
/*
* Re-target the irq to the specified CPU and enable the specified MMR located
* on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3522,7 +3752,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset)
{
- const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
struct irq_cfg *cfg;
int mmr_pnode;
unsigned long mmr_value;
@@ -3530,7 +3760,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long flags;
int err;
- err = assign_irq_vector(irq, *eligible_cpu);
+ cfg = irq_cfg(irq);
+
+ err = assign_irq_vector(irq, cfg, eligible_cpu);
if (err != 0)
return err;
@@ -3539,19 +3771,17 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
irq_name);
spin_unlock_irqrestore(&vector_lock, flags);
- cfg = irq_cfg(irq);
-
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
entry->vector = cfg->vector;
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3592,31 +3822,50 @@ int __init io_apic_get_redir_entries (int ioapic)
return reg_01.bits.entries;
}
-int __init probe_nr_irqs(void)
+void __init probe_nr_irqs_gsi(void)
{
- int idx;
int nr = 0;
-#ifndef CONFIG_XEN
- int nr_min = 32;
-#else
- int nr_min = NR_IRQS;
-#endif
- for (idx = 0; idx < nr_ioapics; idx++)
- nr += io_apic_get_redir_entries(idx) + 1;
+ nr = acpi_probe_gsi();
+ if (nr > nr_irqs_gsi) {
+ nr_irqs_gsi = nr;
+ } else {
+ /* for acpi=off or acpi is not compiled in */
+ int idx;
- /* double it for hotplug and msi and nmi */
- nr <<= 1;
+ nr = 0;
+ for (idx = 0; idx < nr_ioapics; idx++)
+ nr += io_apic_get_redir_entries(idx) + 1;
- /* something wrong ? */
- if (nr < nr_min)
- nr = nr_min;
- if (WARN_ON(nr > NR_IRQS))
- nr = NR_IRQS;
+ if (nr > nr_irqs_gsi)
+ nr_irqs_gsi = nr;
+ }
- return nr;
+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+ /*
+ * for MSI and HT dyn irq
+ */
+ nr += nr_irqs_gsi * 16;
+#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ return 0;
+}
+#endif
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
@@ -3642,7 +3891,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
*/
if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+ apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
@@ -3658,10 +3907,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(apic_id_map, apic_id)) {
+ if (apic->check_apicid_used(apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
+ if (!apic->check_apicid_used(apic_id_map, i))
break;
}
@@ -3674,7 +3923,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
apic_id = i;
}
- tmp = apicid_to_cpu_present(apic_id);
+ tmp = apic->apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
@@ -3713,19 +3962,31 @@ int __init io_apic_get_version(int ioapic)
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
+
if (!IO_APIC_IRQ(irq)) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
ioapic);
return -EINVAL;
}
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+ }
- setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
return 0;
}
@@ -3739,8 +4000,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
+ if (mp_irqs[i].irqtype == mp_INT &&
+ mp_irqs[i].srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -3755,13 +4016,15 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
+ * so mask in all cases should simply be apic->target_cpus()
*/
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
+ struct irq_desc *desc;
struct irq_cfg *cfg;
+ const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
@@ -3777,17 +4040,31 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- cfg = irq_cfg(irq);
- if (!cfg->vector)
- setup_IO_APIC_irq(ioapic, pin, irq,
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
+ if (!cfg->vector) {
+ setup_IO_APIC_irq(ioapic, pin, irq, desc,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
+ continue;
+
+ }
+
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+ mask = desc->affinity;
+ else
+ mask = apic->target_cpus();
+
#ifdef CONFIG_INTR_REMAP
- else if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
-#endif
+ if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
else
- set_ioapic_affinity_irq(irq, TARGET_CPUS);
+#endif
+ set_ioapic_affinity_irq_desc(desc, mask);
}
}
@@ -3836,11 +4113,10 @@ void __init ioapic_init_mappings(void)
struct resource *ioapic_res;
int i;
- irq_2_pin_init();
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ ioapic_phys = mp_ioapics[i].apicaddr;
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
printk(KERN_ERR
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 19191430274..e41980a373a 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -35,8 +35,8 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base,
*/
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
- struct thread_struct * t = &current->thread;
- struct tss_struct * tss;
+ struct thread_struct *t = &current->thread;
+ struct tss_struct *tss;
unsigned int i, max_long, bytes, bytes_updated;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
@@ -131,9 +131,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
}
#ifdef CONFIG_X86_32
-asmlinkage long sys_iopl(unsigned long regsp)
+long sys_iopl(struct pt_regs *regs)
{
- struct pt_regs *regs = (struct pt_regs *)&regsp;
unsigned int level = regs->bx;
struct thread_struct *t = &current->thread;
int rc;
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index f1c688e46f3..dbf5445727a 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -17,135 +17,121 @@
#include <asm/mmu_context.h>
#include <asm/apic.h>
#include <asm/proto.h>
+#include <asm/ipi.h>
-#ifdef CONFIG_X86_32
-#include <mach_apic.h>
-#include <mach_ipi.h>
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR(unsigned int shortcut, int vector)
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
-
- switch (vector) {
- default:
- icr |= APIC_DM_FIXED | vector;
- break;
- case NMI_VECTOR:
- icr |= APIC_DM_NMI;
- break;
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead.
+ * - mbligh
+ */
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
}
- return icr;
+ local_irq_restore(flags);
}
-static inline int __prepare_ICR2(unsigned int mask)
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector)
{
- return SET_APIC_DEST_FIELD(mask);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int query_cpu;
+ unsigned long flags;
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
}
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector)
{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
+ unsigned long flags;
+ unsigned int query_cpu;
/*
- * No need to touch the target chip field
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
*/
- cfg = __prepare_ICR(shortcut, vector);
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ __default_send_IPI_dest_field(
+ apic->cpu_to_logical_apicid(query_cpu), vector,
+ apic->dest_logical);
+ local_irq_restore(flags);
}
-void send_IPI_self(int vector)
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector)
{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
- */
-static inline void __send_IPI_dest_field(unsigned long mask, int vector)
-{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
- if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
- else
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- apic_write(APIC_ICR2, cfg);
+ unsigned long flags;
+ unsigned int query_cpu;
+ unsigned int this_cpu = smp_processor_id();
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
+ /* See Hack comment above */
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(
+ apic->cpu_to_logical_apicid(query_cpu), vector,
+ apic->dest_logical);
+ }
+ local_irq_restore(flags);
}
+#ifdef CONFIG_X86_32
+
/*
* This is only used on smaller machines.
*/
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
local_irq_save(flags);
- WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
- __send_IPI_dest_field(mask, vector);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
local_irq_restore(flags);
}
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+void default_send_IPI_allbutself(int vector)
{
- unsigned long flags;
- unsigned int query_cpu;
-
/*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
+ * if there are no other CPUs in the system then we get an APIC send
+ * error if we try to broadcast, thus avoid sending IPIs in this case.
*/
+ if (!(num_online_cpus() > 1))
+ return;
- local_irq_save(flags);
- for_each_possible_cpu(query_cpu) {
- if (cpu_isset(query_cpu, mask)) {
- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
- vector);
- }
- }
- local_irq_restore(flags);
+ __default_local_send_IPI_allbutself(vector);
+}
+
+void default_send_IPI_all(int vector)
+{
+ __default_local_send_IPI_all(vector);
+}
+
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
}
/* must come after the send_IPI functions above for inlining */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f64..f13ca1650aa 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -5,10 +5,13 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>
+#include <linux/smp.h>
+#include <linux/ftrace.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
-#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/idle.h>
atomic_t irq_err_count;
@@ -35,11 +38,7 @@ void ack_bad_irq(unsigned int irq)
#endif
}
-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) cpu_pda(x)
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
@@ -118,6 +117,9 @@ int show_interrupts(struct seq_file *p, void *v)
}
desc = irq_to_desc(i);
+ if (!desc)
+ return 0;
+
spin_lock_irqsave(&desc->lock, flags);
#ifndef CONFIG_SMP
any_count = kstat_irqs(i);
@@ -187,3 +189,41 @@ u64 arch_irq_stat(void)
#endif
return sum;
}
+
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax;
+ unsigned irq;
+
+ exit_idle();
+ irq_enter();
+
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ if (!handle_irq(irq, regs)) {
+#ifdef CONFIG_X86_64
+ if (!disable_apic)
+ ack_APIC_irq();
+#endif
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
+ }
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de..4beb9a13873 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -15,9 +15,9 @@
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/delay.h>
+#include <linux/uaccess.h>
#include <asm/apic.h>
-#include <asm/uaccess.h>
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -93,7 +93,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
return 0;
/* build the stack frame on the IRQ stack */
- isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
+ isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
irqctx->tinfo.task = curctx->tinfo.task;
irqctx->tinfo.previous_esp = current_stack_pointer;
@@ -137,7 +137,7 @@ void __cpuinit irq_ctx_init(int cpu)
hardirq_ctx[cpu] = irqctx;
- irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+ irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
@@ -147,7 +147,7 @@ void __cpuinit irq_ctx_init(int cpu)
softirq_ctx[cpu] = irqctx;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+ cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
}
void irq_ctx_exit(int cpu)
@@ -174,7 +174,7 @@ asmlinkage void do_softirq(void)
irqctx->tinfo.previous_esp = current_stack_pointer;
/* build the stack frame on the softirq stack */
- isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+ isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
call_on_stack(__do_softirq, isp);
/*
@@ -191,33 +191,16 @@ static inline int
execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
#endif
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int do_IRQ(struct pt_regs *regs)
+bool handle_irq(unsigned irq, struct pt_regs *regs)
{
- struct pt_regs *old_regs;
- /* high bit used in ret_from_ code */
- int overflow;
- unsigned vector = ~regs->orig_ax;
struct irq_desc *desc;
- unsigned irq;
-
-
- old_regs = set_irq_regs(regs);
- irq_enter();
- irq = __get_cpu_var(vector_irq)[vector];
+ int overflow;
overflow = check_stack_overflow();
desc = irq_to_desc(irq);
- if (unlikely(!desc)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
- __func__, irq, vector, smp_processor_id());
- BUG();
- }
+ if (unlikely(!desc))
+ return false;
if (!execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
@@ -225,33 +208,34 @@ unsigned int do_IRQ(struct pt_regs *regs)
desc->handle_irq(irq, desc);
}
- irq_exit();
- set_irq_regs(old_regs);
- return 1;
+ return true;
}
#ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/genapic.h>
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
{
unsigned int irq;
static int warned;
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- cpumask_t mask;
+ const struct cpumask *affinity;
+ if (!desc)
+ continue;
if (irq == 2)
continue;
- cpus_and(mask, desc->affinity, map);
- if (any_online_cpu(mask) == NR_CPUS) {
+ affinity = desc->affinity;
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
printk("Breaking affinity for irq %i\n", irq);
- mask = map;
+ affinity = cpu_all_mask;
}
if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, mask);
+ desc->chip->set_affinity(irq, affinity);
else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a..977d8b43a0d 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,12 +13,19 @@
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include <asm/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/smp.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
-#include <asm/smp.h>
+#include <asm/apic.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
*
@@ -28,95 +35,71 @@
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
u64 curbase = (u64)task_stack_page(current);
- static unsigned long warned = -60*HZ;
-
- if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) + 128 &&
- time_after(jiffies, warned + 60*HZ)) {
- printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
- show_stack(NULL,NULL);
- warned = jiffies;
- }
-}
+
+ WARN_ONCE(regs->sp >= curbase &&
+ regs->sp <= curbase + THREAD_SIZE &&
+ regs->sp < curbase + sizeof(struct thread_info) +
+ sizeof(struct pt_regs) + 128,
+
+ "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+ current->comm, curbase, regs->sp);
#endif
+}
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+bool handle_irq(unsigned irq, struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
- unsigned irq;
-
- exit_idle();
- irq_enter();
- irq = __get_cpu_var(vector_irq)[vector];
-
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
stack_overflow_check(regs);
-#endif
desc = irq_to_desc(irq);
- if (likely(desc))
- generic_handle_irq_desc(irq, desc);
- else {
- if (!disable_apic)
- ack_APIC_irq();
-
- if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
- __func__, smp_processor_id(), vector);
- }
+ if (unlikely(!desc))
+ return false;
- irq_exit();
-
- set_irq_regs(old_regs);
- return 1;
+ generic_handle_irq_desc(irq, desc);
+ return true;
}
#ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
{
unsigned int irq;
static int warned;
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
+ const struct cpumask *affinity;
+ if (!desc)
+ continue;
if (irq == 2)
continue;
/* interrupt's are disabled at this point */
spin_lock(&desc->lock);
+ affinity = desc->affinity;
if (!irq_has_action(irq) ||
- cpus_equal(desc->affinity, map)) {
+ cpumask_equal(affinity, cpu_online_mask)) {
spin_unlock(&desc->lock);
continue;
}
- cpus_and(mask, desc->affinity, map);
- if (cpus_empty(mask)) {
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
- mask = map;
+ affinity = cpu_all_mask;
}
if (desc->chip->mask)
desc->chip->mask(irq);
if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, mask);
+ desc->chip->set_affinity(irq, affinity);
else if (!(warned++))
set_affinity = 0;
@@ -142,18 +125,18 @@ extern void call_softirq(void);
asmlinkage void do_softirq(void)
{
- __u32 pending;
- unsigned long flags;
+ __u32 pending;
+ unsigned long flags;
- if (in_interrupt())
- return;
+ if (in_interrupt())
+ return;
- local_irq_save(flags);
- pending = local_softirq_pending();
- /* Switch to interrupt stack */
- if (pending) {
+ local_irq_save(flags);
+ pending = local_softirq_pending();
+ /* Switch to interrupt stack */
+ if (pending) {
call_softirq();
WARN_ON_ONCE(softirq_count());
}
- local_irq_restore(flags);
+ local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e8..bf629cadec1 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -9,18 +9,18 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/delay.h>
#include <asm/atomic.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/timer.h>
#include <asm/pgtable.h>
-#include <asm/delay.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/arch_hooks.h>
#include <asm/i8259.h>
-
+#include <asm/traps.h>
/*
@@ -34,12 +34,10 @@
* leads to races. IBM designers who came up with it should
* be shot.
*/
-
static irqreturn_t math_error_irq(int cpl, void *dev_id)
{
- extern void math_error(void __user *);
- outb(0,0xF0);
+ outb(0, 0xF0);
if (ignore_fpu_irq || !boot_cpu_data.hard_math)
return IRQ_NONE;
math_error((void __user *)get_irq_regs()->ip);
@@ -56,7 +54,7 @@ static struct irqaction fpu_irq = {
.name = "fpu",
};
-void __init init_ISA_irqs (void)
+void __init init_ISA_irqs(void)
{
int i;
@@ -68,8 +66,7 @@ void __init init_ISA_irqs (void)
/*
* 16 old-style INTA-cycle interrupts:
*/
- for (i = 0; i < 16; i++) {
- /* first time call this irq_desc */
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;
@@ -111,6 +108,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (per_cpu(vector_irq, cpu)[vector] != -1)
+ return 1;
+ }
+
+ return 0;
+}
+
/* Overridden in paravirt.c */
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
@@ -129,7 +138,7 @@ void __init native_init_IRQ(void)
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* SYSCALL_VECTOR was reserved in trap_init. */
if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i]);
+ set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
}
@@ -140,17 +149,26 @@ void __init native_init_IRQ(void)
*/
alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
- /* IPI for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ /* IPIs for invalidation */
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
/* IPI for generic function call */
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
/* IPI for single call function */
- set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+ call_function_single_interrupt);
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff023539128..da481a1e3f3 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -11,54 +11,19 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
-#include <asm/acpi.h>
#include <asm/atomic.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
-#include <asm/delay.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>
/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-/*
- * SMP has a few special interrupts for IPI messages
- */
-
-#define BUILD_IRQ(nr) \
- asmlinkage void IRQ_NAME(nr); \
- asm("\n.text\n.p2align\n" \
- "IRQ" #nr "_interrupt:\n\t" \
- "push $~(" #nr ") ; " \
- "jmp common_interrupt\n" \
- ".previous");
-
-#define BI(x,y) \
- BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
- BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
* (these are usually mapped to vectors 0x30-0x3f)
*/
@@ -73,37 +38,6 @@
*
* (these are usually mapped into the 0x30-0xff vector range)
*/
- BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
- IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
- IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
- IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
- IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
- IRQLIST_16(0x2), IRQLIST_16(0x3),
- IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-
-
/*
* IRQ2 is cascade interrupt to second interrupt controller
@@ -135,15 +69,26 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
-void __init init_ISA_irqs(void)
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (per_cpu(vector_irq, cpu)[vector] != -1)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void __init init_ISA_irqs(void)
{
int i;
init_bsp_APIC();
init_8259A(0);
- for (i = 0; i < 16; i++) {
- /* first time call this irq_desc */
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;
@@ -188,6 +133,7 @@ static void __init smp_intr_init(void)
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
#endif
}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 10435a120d2..5c4f5548384 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,7 @@
#include <asm/apicdef.h>
#include <asm/system.h>
-#include <mach_ipi.h>
+#include <asm/genapic.h>
/*
* Put the error code here just in case the user cares:
@@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
*/
void kgdb_roundup_cpus(unsigned long flags)
{
- send_IPI_allbutself(APIC_DM_NMI);
+ apic->send_IPI_allbutself(APIC_DM_NMI);
}
#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 6c27679ec6a..e948b28a5a9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -376,9 +376,10 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
- mutex_lock(&kprobe_mutex);
- free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
- mutex_unlock(&kprobe_mutex);
+ if (p->ainsn.insn) {
+ free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+ p->ainsn.insn = NULL;
+ }
}
static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
@@ -445,7 +446,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
if (p->ainsn.boostable == 1 && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
reset_current_kprobe();
@@ -694,7 +695,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
/*
* It is possible to have multiple instances associated with a given
* task either because multiple functions in the call path have
- * return probes installed on them, and/or more then one
+ * return probes installed on them, and/or more than one
* return probe was registered for a target function.
*
* We can handle this because:
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 774ac499156..652fce6d2cc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- return preset_lpj;
+ struct pvclock_vcpu_time_info *src;
+ src = &per_cpu(hv_clock, 0);
+ return pvclock_tsc_khz(src);
}
static void kvm_get_preset_lpj(void)
{
- struct pvclock_vcpu_time_info *src;
unsigned long khz;
u64 lpj;
- src = &per_cpu(hv_clock, 0);
- khz = pvclock_tsc_khz(src);
+ khz = kvm_get_tsc_khz();
lpj = ((u64)khz * 1000);
do_div(lpj, HZ);
@@ -128,7 +128,7 @@ static int kvm_register_clock(char *txt)
}
#ifdef CONFIG_X86_LOCAL_APIC
-static void kvm_setup_secondary_clock(void)
+static void __cpuinit kvm_setup_secondary_clock(void)
{
/*
* Now that the first cpu already had this clocksource initialized,
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
+ pv_info.paravirt_enabled = 1;
+ pv_info.name = "KVM";
}
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index eee32b43fee..71f1d99a635 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,8 +12,8 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
if (err < 0)
return err;
- for(i = 0; i < old->size; i++)
+ for (i = 0; i < old->size; i++)
write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
return 0;
}
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c32..8815f3c7fec 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
/*
* The MFPGT timers on the CS5536 provide us with suitable timers to use
* as clock event sources - not as good as a HPET or APIC, but certainly
- * better then the PIT. This isn't a general purpose MFGPT driver, but
+ * better than the PIT. This isn't a general purpose MFGPT driver, but
* a simplified one designed specifically to act as a clock event source.
* For full details about the MFGPT, please consult the CS5536 data sheet.
*/
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
.set_mode = mfgpt_set_mode,
.set_next_event = mfgpt_next_event,
.rating = 250,
- .cpumask = CPU_MASK_ALL,
+ .cpumask = cpu_all_mask,
.shift = 32
};
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 5f8e5d75a25..c25fdb38229 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -10,7 +10,7 @@
* This driver allows to upgrade microcode on AMD
* family 0x10 and 0x11 processors.
*
- * Licensed unter the terms of the GNU General Public
+ * Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
@@ -32,9 +32,9 @@
#include <linux/platform_device.h>
#include <linux/pci.h>
#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/microcode.h>
@@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2");
#define UCODE_UCODE_TYPE 0x00000001
struct equiv_cpu_entry {
- unsigned int installed_cpu;
- unsigned int fixed_errata_mask;
- unsigned int fixed_errata_compare;
- unsigned int equiv_cpu;
-};
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __attribute__((packed));
struct microcode_header_amd {
- unsigned int data_code;
- unsigned int patch_id;
- unsigned char mc_patch_data_id[2];
- unsigned char mc_patch_data_len;
- unsigned char init_flag;
- unsigned int mc_patch_data_checksum;
- unsigned int nb_dev_id;
- unsigned int sb_dev_id;
- unsigned char processor_rev_id[2];
- unsigned char nb_rev_id;
- unsigned char sb_rev_id;
- unsigned char bios_api_rev;
- unsigned char reserved1[3];
- unsigned int match_reg[8];
-};
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __attribute__((packed));
struct microcode_amd {
struct microcode_header_amd hdr;
unsigned int mpb[0];
};
-#define UCODE_MAX_SIZE (2048)
-#define DEFAULT_UCODE_DATASIZE (896)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define DWSIZE (sizeof(u32))
-/* For now we support a fixed ucode total size only */
-#define get_totalsize(mc) \
- ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
- + MC_HEADER_SIZE)
+#define UCODE_MAX_SIZE 2048
+#define UCODE_CONTAINER_SECTION_HDR 8
+#define UCODE_CONTAINER_HEADER_SIZE 12
/* serialize access to the physical write */
static DEFINE_SPINLOCK(microcode_update_lock);
@@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table;
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ u32 dummy;
memset(csig, 0, sizeof(*csig));
-
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
- cpu);
+ printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
+ "supported\n", cpu, c->x86);
return -1;
}
-
- asm volatile("movl %1, %%ecx; rdmsr"
- : "=a" (csig->rev)
- : "i" (0x0000008B) : "ecx");
-
- printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
- csig->rev);
-
+ rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
+ printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
return 0;
}
static int get_matching_microcode(int cpu, void *mc, int rev)
{
struct microcode_header_amd *mc_header = mc;
- struct pci_dev *nb_pci_dev, *sb_pci_dev;
unsigned int current_cpu_id;
- unsigned int equiv_cpu_id = 0x00;
+ u16 equiv_cpu_id = 0;
unsigned int i = 0;
BUG_ON(equiv_cpu_table == NULL);
@@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
}
if (!equiv_cpu_id) {
- printk(KERN_ERR "microcode: CPU%d cpu_id "
- "not found in equivalent cpu table \n", cpu);
+ printk(KERN_WARNING "microcode: CPU%d: cpu revision "
+ "not listed in equivalent cpu table\n", cpu);
return 0;
}
- if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
- printk(KERN_ERR
- "microcode: CPU%d patch does not match "
- "(patch is %x, cpu extended is %x) \n",
- cpu, mc_header->processor_rev_id[0],
- (equiv_cpu_id & 0xff));
+ if (mc_header->processor_rev_id != equiv_cpu_id) {
+ printk(KERN_ERR "microcode: CPU%d: patch mismatch "
+ "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
+ cpu, mc_header->processor_rev_id, equiv_cpu_id);
return 0;
}
- if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
- printk(KERN_ERR "microcode: CPU%d patch does not match "
- "(patch is %x, cpu base id is %x) \n",
- cpu, mc_header->processor_rev_id[1],
- ((equiv_cpu_id >> 16) & 0xff));
-
+ /* ucode might be chipset specific -- currently we don't support this */
+ if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
+ printk(KERN_ERR "microcode: CPU%d: loading of chipset "
+ "specific code not yet supported\n", cpu);
return 0;
}
- /* ucode may be northbridge specific */
- if (mc_header->nb_dev_id) {
- nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
- (mc_header->nb_dev_id & 0xff),
- NULL);
- if ((!nb_pci_dev) ||
- (mc_header->nb_rev_id != nb_pci_dev->revision)) {
- printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
- pci_dev_put(nb_pci_dev);
- return 0;
- }
- pci_dev_put(nb_pci_dev);
- }
-
- /* ucode may be southbridge specific */
- if (mc_header->sb_dev_id) {
- sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
- (mc_header->sb_dev_id & 0xff),
- NULL);
- if ((!sb_pci_dev) ||
- (mc_header->sb_rev_id != sb_pci_dev->revision)) {
- printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
- pci_dev_put(sb_pci_dev);
- return 0;
- }
- pci_dev_put(sb_pci_dev);
- }
-
if (mc_header->patch_id <= rev)
return 0;
@@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
static void apply_microcode_amd(int cpu)
{
unsigned long flags;
- unsigned int eax, edx;
- unsigned int rev;
+ u32 rev, dummy;
int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
struct microcode_amd *mc_amd = uci->mc;
- unsigned long addr;
/* We should bind the task to the CPU */
BUG_ON(cpu_num != cpu);
@@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu)
return;
spin_lock_irqsave(&microcode_update_lock, flags);
-
- addr = (unsigned long)&mc_amd->hdr.data_code;
- edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
- eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
-
- asm volatile("movl %0, %%ecx; wrmsr" :
- : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
-
+ wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* get patch id after patching */
- asm volatile("movl %1, %%ecx; rdmsr"
- : "=a" (rev)
- : "i" (0x0000008B) : "ecx");
-
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
spin_unlock_irqrestore(&microcode_update_lock, flags);
/* check current patch id and patch's id for match */
if (rev != mc_amd->hdr.patch_id) {
- printk(KERN_ERR "microcode: CPU%d update from revision "
- "0x%x to 0x%x failed\n", cpu_num,
- mc_amd->hdr.patch_id, rev);
+ printk(KERN_ERR "microcode: CPU%d: update failed "
+ "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
return;
}
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x \n",
- cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
+ printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
+ cpu, rev);
uci->cpu_sig.rev = rev;
}
-static void * get_next_ucode(u8 *buf, unsigned int size,
- int (*get_ucode_data)(void *, const void *, size_t),
- unsigned int *mc_size)
+static int get_ucode_data(void *to, const u8 *from, size_t n)
+{
+ memcpy(to, from, n);
+ return 0;
+}
+
+static void *get_next_ucode(const u8 *buf, unsigned int size,
+ unsigned int *mc_size)
{
unsigned int total_size;
-#define UCODE_CONTAINER_SECTION_HDR 8
u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
void *mc;
@@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size,
return NULL;
if (section_hdr[0] != UCODE_UCODE_TYPE) {
- printk(KERN_ERR "microcode: error! "
- "Wrong microcode payload type field\n");
+ printk(KERN_ERR "microcode: error: invalid type field in "
+ "container file section header\n");
return NULL;
}
total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
- printk(KERN_INFO "microcode: size %u, total_size %u\n",
- size, total_size);
+ printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
+ size, total_size);
if (total_size > size || total_size > UCODE_MAX_SIZE) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ printk(KERN_ERR "microcode: error: size mismatch\n");
return NULL;
}
mc = vmalloc(UCODE_MAX_SIZE);
if (mc) {
memset(mc, 0, UCODE_MAX_SIZE);
- if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
+ if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
+ total_size)) {
vfree(mc);
mc = NULL;
} else
*mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
}
-#undef UCODE_CONTAINER_SECTION_HDR
return mc;
}
-static int install_equiv_cpu_table(u8 *buf,
- int (*get_ucode_data)(void *, const void *, size_t))
+static int install_equiv_cpu_table(const u8 *buf)
{
-#define UCODE_CONTAINER_HEADER_SIZE 12
u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
unsigned int *buf_pos = (unsigned int *)container_hdr;
unsigned long size;
@@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf,
size = buf_pos[2];
if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
- printk(KERN_ERR "microcode: error! "
- "Wrong microcode equivalnet cpu table\n");
+ printk(KERN_ERR "microcode: error: invalid type field in "
+ "container file section header\n");
return 0;
}
equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
if (!equiv_cpu_table) {
- printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
+ printk(KERN_ERR "microcode: failed to allocate "
+ "equivalent CPU table\n");
return 0;
}
@@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf,
}
return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
-#undef UCODE_CONTAINER_HEADER_SIZE
}
static void free_equiv_cpu_table(void)
@@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void)
}
}
-static int generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
+static int generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+ const u8 *ucode_ptr = data;
+ void *new_mc = NULL;
+ void *mc;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover;
unsigned long offset;
- offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
+ offset = install_equiv_cpu_table(ucode_ptr);
if (!offset) {
- printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
+ printk(KERN_ERR "microcode: failed to create "
+ "equivalent cpu table\n");
return -EINVAL;
}
@@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
unsigned int uninitialized_var(mc_size);
struct microcode_header_amd *mc_header;
- mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
+ mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
if (!mc)
break;
@@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
vfree(new_mc);
new_rev = mc_header->patch_id;
new_mc = mc;
- } else
+ } else
vfree(mc);
ucode_ptr += mc_size;
@@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
if (uci->mc)
vfree(uci->mc);
uci->mc = new_mc;
- pr_debug("microcode: CPU%d found a matching microcode update with"
- " version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
+ pr_debug("microcode: CPU%d found a matching microcode "
+ "update with version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
} else
vfree(new_mc);
}
@@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
return (int)leftover;
}
-static int get_ucode_fw(void *to, const void *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
static int request_microcode_fw(int cpu, struct device *device)
{
const char *fw_name = "amd-ucode/microcode_amd.bin";
@@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device)
ret = request_firmware(&firmware, fw_name, device);
if (ret) {
- printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
+ printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
return ret;
}
- ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
- &get_ucode_fw);
+ ret = generic_load_microcode(cpu, firmware->data, firmware->size);
release_firmware(firmware);
@@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device)
static int request_microcode_user(int cpu, const void __user *buf, size_t size)
{
- printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
- "is not supported\n");
+ printk(KERN_INFO "microcode: AMD microcode update via "
+ "/dev/cpu/microcode not supported\n");
return -1;
}
@@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void)
{
return &microcode_amd_ops;
}
+
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 82fb2809ce3..c9b721ba968 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -99,7 +99,7 @@ MODULE_LICENSE("GPL");
#define MICROCODE_VERSION "2.00"
-struct microcode_ops *microcode_ops;
+static struct microcode_ops *microcode_ops;
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
@@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
#endif
/* fake device for request_firmware */
-struct platform_device *microcode_pdev;
+static struct platform_device *microcode_pdev;
static ssize_t reload_store(struct sys_device *dev,
struct sysdev_attribute *attr,
@@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = {
.name = "microcode",
};
-static void microcode_fini_cpu(int cpu)
+static void __microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- mutex_lock(&microcode_mutex);
microcode_ops->microcode_fini_cpu(cpu);
uci->valid = 0;
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+ mutex_lock(&microcode_mutex);
+ __microcode_fini_cpu(cpu);
mutex_unlock(&microcode_mutex);
}
@@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu)
* to this cpu (a bit of paranoia):
*/
if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
- microcode_fini_cpu(cpu);
+ __microcode_fini_cpu(cpu);
+ printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
+ cpu);
return -1;
}
- if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
- microcode_fini_cpu(cpu);
+ if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
+ __microcode_fini_cpu(cpu);
+ printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
+ cpu);
/* Should we look for a new ucode here? */
return 1;
}
@@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu)
return 0;
}
-void microcode_update_cpu(int cpu)
+static void microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int err = 0;
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 622dc4a2178..5e9f4fc5138 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -87,9 +87,9 @@
#include <linux/cpu.h>
#include <linux/firmware.h>
#include <linux/platform_device.h>
+#include <linux/uaccess.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/microcode.h>
@@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock);
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+ unsigned long flags;
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
@@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->pf = 1 << ((val[1] >> 18) & 7);
}
+ /* serialize access to the physical write to MSR 0x79 */
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* see notes above for revision 1.07. Apparent chip bug */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+
pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
csig->sig, csig->pf, csig->rev);
@@ -190,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
}
-static inline int
+static inline int
update_match_revision(struct microcode_header_intel *mc_header, int rev)
{
return (mc_header->rev <= rev) ? 0 : 1;
@@ -436,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
return ret;
}
- ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
- &get_ucode_fw);
+ ret = generic_load_microcode(cpu, (void *)firmware->data,
+ firmware->size, &get_ucode_fw);
release_firmware(firmware);
@@ -454,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
/* We should bind the task to the CPU */
BUG_ON(cpu != raw_smp_processor_id());
- return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
+ return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
static void microcode_fini_cpu(int cpu)
@@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu)
uci->mc = NULL;
}
-struct microcode_ops microcode_intel_ops = {
+static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index efc2f361fe8..666e43df51f 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -13,8 +13,7 @@
#include <asm/msr.h>
#include <asm/acpi.h>
#include <asm/mmconfig.h>
-
-#include "../pci/pci.h"
+#include <asm/pci_x86.h>
struct pci_hostbridge_probe {
u32 bus;
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb..0edd819050e 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
/* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
+ table entries. */
}
/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
*para = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".text", secstrings + s->sh_name))
text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks= s;
+ locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b..c23880b90b5 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt...)
#ifndef CONFIG_UML
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
/* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
+ table entries. */
}
void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
- u64 val;
+ u64 val;
DEBUGP("Applying relocate section %u to %u\n", relsec,
sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+ ELF64_R_SYM(rel[i].r_info);
- DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info),
- sym->st_value, rel[i].r_addend, (u64)loc);
+ DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
- val = sym->st_value + rel[i].r_addend;
+ val = sym->st_value + rel[i].r_addend;
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
if ((s64)val != *(s32 *)loc)
goto overflow;
break;
- case R_X86_64_PC32:
+ case R_X86_64_PC32:
val -= (u64)loc;
*(u32 *)loc = val;
#if 0
if ((s64)val != *(s32 *)loc)
- goto overflow;
+ goto overflow;
#endif
break;
default:
- printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
+ printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return 0;
overflow:
- printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
+ printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
unsigned int relsec,
struct module *me)
{
- printk("non add relocation not supported\n");
+ printk(KERN_ERR "non add relocation not supported\n");
return -ENOSYS;
-}
+}
int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
+ const Elf_Shdr *sechdrs,
+ struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
*para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks= s;
+ locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index f98f4e1dba0..20076445319 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -2,8 +2,8 @@
* Intel Multiprocessor Specification 1.1 and 1.4
* compliant MP-table parsing routines.
*
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*/
@@ -16,25 +16,20 @@
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/module.h>
+#include <linux/smp.h>
-#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
-#include <asm/acpi.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
#include <asm/trampoline.h>
#include <asm/setup.h>
+#include <asm/smp.h>
-#include <mach_apic.h>
-#ifdef CONFIG_X86_32
-#include <mach_apicdef.h>
-#include <mach_mpparse.h>
-#endif
-
+#include <asm/genapic.h>
/*
* Checksum an MP configuration block.
*/
@@ -49,12 +44,12 @@ static int __init mpf_checksum(unsigned char *mp, int len)
return sum & 0xFF;
}
-static void __init MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_cpu *m)
{
int apicid;
char *bootup_cpu = "";
- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ if (!(m->cpuflag & CPU_ENABLED)) {
disabled_cpus++;
return;
}
@@ -62,54 +57,54 @@ static void __init MP_processor_info(struct mpc_config_processor *m)
if (x86_quirks->mpc_apic_id)
apicid = x86_quirks->mpc_apic_id(m);
else
- apicid = m->mpc_apicid;
+ apicid = m->apicid;
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ if (m->cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
- boot_cpu_physical_apicid = m->mpc_apicid;
+ boot_cpu_physical_apicid = m->apicid;
}
- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
- generic_processor_info(apicid, m->mpc_apicver);
+ printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+ generic_processor_info(apicid, m->apicver);
}
#ifdef CONFIG_X86_IO_APIC
-static void __init MP_bus_info(struct mpc_config_bus *m)
+static void __init MP_bus_info(struct mpc_bus *m)
{
char str[7];
- memcpy(str, m->mpc_bustype, 6);
+ memcpy(str, m->bustype, 6);
str[6] = 0;
if (x86_quirks->mpc_oem_bus_info)
x86_quirks->mpc_oem_bus_info(m, str);
else
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
#if MAX_MP_BUSSES < 256
- if (m->mpc_busid >= MAX_MP_BUSSES) {
+ if (m->busid >= MAX_MP_BUSSES) {
printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
" is too large, max. supported is %d\n",
- m->mpc_busid, str, MAX_MP_BUSSES - 1);
+ m->busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+ set_bit(m->busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
if (x86_quirks->mpc_oem_pci_bus)
x86_quirks->mpc_oem_pci_bus(m);
- clear_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ clear_bit(m->busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
} else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+ mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -133,89 +128,88 @@ static int bad_ioapic(unsigned long address)
return 0;
}
-static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
+static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
- if (!(m->mpc_flags & MPC_APIC_USABLE))
+ if (!(m->flags & MPC_APIC_USABLE))
return;
printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+ m->apicid, m->apicver, m->apicaddr);
- if (bad_ioapic(m->mpc_apicaddr))
+ if (bad_ioapic(m->apicaddr))
return;
- mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
- mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
- mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
- mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
- mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
+ mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
+ mp_ioapics[nr_ioapics].apicid = m->apicid;
+ mp_ioapics[nr_ioapics].type = m->type;
+ mp_ioapics[nr_ioapics].apicver = m->apicver;
+ mp_ioapics[nr_ioapics].flags = m->flags;
nr_ioapics++;
}
-static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_intsrc *m)
{
apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
}
-static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
- (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
- mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+ mp_irq->irqtype, mp_irq->irqflag & 3,
+ (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+ mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
-static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
- struct mp_config_intsrc *mp_irq)
+static void __init assign_to_mp_irq(struct mpc_intsrc *m,
+ struct mpc_intsrc *mp_irq)
{
- mp_irq->mp_dstapic = m->mpc_dstapic;
- mp_irq->mp_type = m->mpc_type;
- mp_irq->mp_irqtype = m->mpc_irqtype;
- mp_irq->mp_irqflag = m->mpc_irqflag;
- mp_irq->mp_srcbus = m->mpc_srcbus;
- mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
- mp_irq->mp_dstirq = m->mpc_dstirq;
+ mp_irq->dstapic = m->dstapic;
+ mp_irq->type = m->type;
+ mp_irq->irqtype = m->irqtype;
+ mp_irq->irqflag = m->irqflag;
+ mp_irq->srcbus = m->srcbus;
+ mp_irq->srcbusirq = m->srcbusirq;
+ mp_irq->dstirq = m->dstirq;
}
-static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
- struct mpc_config_intsrc *m)
+static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
+ struct mpc_intsrc *m)
{
- m->mpc_dstapic = mp_irq->mp_dstapic;
- m->mpc_type = mp_irq->mp_type;
- m->mpc_irqtype = mp_irq->mp_irqtype;
- m->mpc_irqflag = mp_irq->mp_irqflag;
- m->mpc_srcbus = mp_irq->mp_srcbus;
- m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
- m->mpc_dstirq = mp_irq->mp_dstirq;
+ m->dstapic = mp_irq->dstapic;
+ m->type = mp_irq->type;
+ m->irqtype = mp_irq->irqtype;
+ m->irqflag = mp_irq->irqflag;
+ m->srcbus = mp_irq->srcbus;
+ m->srcbusirq = mp_irq->srcbusirq;
+ m->dstirq = mp_irq->dstirq;
}
-static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
- struct mpc_config_intsrc *m)
+static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
+ struct mpc_intsrc *m)
{
- if (mp_irq->mp_dstapic != m->mpc_dstapic)
+ if (mp_irq->dstapic != m->dstapic)
return 1;
- if (mp_irq->mp_type != m->mpc_type)
+ if (mp_irq->type != m->type)
return 2;
- if (mp_irq->mp_irqtype != m->mpc_irqtype)
+ if (mp_irq->irqtype != m->irqtype)
return 3;
- if (mp_irq->mp_irqflag != m->mpc_irqflag)
+ if (mp_irq->irqflag != m->irqflag)
return 4;
- if (mp_irq->mp_srcbus != m->mpc_srcbus)
+ if (mp_irq->srcbus != m->srcbus)
return 5;
- if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+ if (mp_irq->srcbusirq != m->srcbusirq)
return 6;
- if (mp_irq->mp_dstirq != m->mpc_dstirq)
+ if (mp_irq->dstirq != m->dstirq)
return 7;
return 0;
}
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void __init MP_intsrc_info(struct mpc_intsrc *m)
{
int i;
@@ -233,57 +227,55 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
#endif
-static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
+static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
+ m->srcbusirq, m->destapic, m->destapiclint);
}
/*
* Read/parse the MPC
*/
-static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
- char *str)
+static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
- if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
+ if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
- mpc->mpc_signature[0], mpc->mpc_signature[1],
- mpc->mpc_signature[2], mpc->mpc_signature[3]);
+ mpc->signature[0], mpc->signature[1],
+ mpc->signature[2], mpc->signature[3]);
return 0;
}
- if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
+ if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
printk(KERN_ERR "MPTABLE: checksum error!\n");
return 0;
}
- if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
+ if (mpc->spec != 0x01 && mpc->spec != 0x04) {
printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->mpc_spec);
+ mpc->spec);
return 0;
}
- if (!mpc->mpc_lapic) {
+ if (!mpc->lapic) {
printk(KERN_ERR "MPTABLE: null local APIC address!\n");
return 0;
}
- memcpy(oem, mpc->mpc_oem, 8);
+ memcpy(oem, mpc->oem, 8);
oem[8] = 0;
printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
- memcpy(str, mpc->mpc_productid, 12);
+ memcpy(str, mpc->productid, 12);
str[12] = 0;
printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
- printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
return 1;
}
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
char oem[10];
@@ -295,27 +287,18 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
return 0;
#ifdef CONFIG_X86_32
- /*
- * need to make sure summit and es7000's mps_oem_check is safe to be
- * called early via genericarch 's mps_oem_check
- */
- if (early) {
-#ifdef CONFIG_X86_NUMAQ
- numaq_mps_oem_check(mpc, oem, str);
-#endif
- } else
- mps_oem_check(mpc, oem, str);
+ generic_mps_oem_check(mpc, oem, str);
#endif
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
+ mp_lapic_addr = mpc->lapic;
if (early)
return 1;
- if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
- struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
- x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+ if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
+ struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
+ x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
}
/*
@@ -324,12 +307,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
if (x86_quirks->mpc_record)
*x86_quirks->mpc_record = 0;
- while (count < mpc->mpc_length) {
+ while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
{
- struct mpc_config_processor *m =
- (struct mpc_config_processor *)mpt;
+ struct mpc_cpu *m = (struct mpc_cpu *)mpt;
/* ACPI may have already provided this data */
if (!acpi_lapic)
MP_processor_info(m);
@@ -339,8 +321,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
}
case MP_BUS:
{
- struct mpc_config_bus *m =
- (struct mpc_config_bus *)mpt;
+ struct mpc_bus *m = (struct mpc_bus *)mpt;
#ifdef CONFIG_X86_IO_APIC
MP_bus_info(m);
#endif
@@ -351,30 +332,28 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
case MP_IOAPIC:
{
#ifdef CONFIG_X86_IO_APIC
- struct mpc_config_ioapic *m =
- (struct mpc_config_ioapic *)mpt;
+ struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
MP_ioapic_info(m);
#endif
- mpt += sizeof(struct mpc_config_ioapic);
- count += sizeof(struct mpc_config_ioapic);
+ mpt += sizeof(struct mpc_ioapic);
+ count += sizeof(struct mpc_ioapic);
break;
}
case MP_INTSRC:
{
#ifdef CONFIG_X86_IO_APIC
- struct mpc_config_intsrc *m =
- (struct mpc_config_intsrc *)mpt;
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
MP_intsrc_info(m);
#endif
- mpt += sizeof(struct mpc_config_intsrc);
- count += sizeof(struct mpc_config_intsrc);
+ mpt += sizeof(struct mpc_intsrc);
+ count += sizeof(struct mpc_intsrc);
break;
}
case MP_LINTSRC:
{
- struct mpc_config_lintsrc *m =
- (struct mpc_config_lintsrc *)mpt;
+ struct mpc_lintsrc *m =
+ (struct mpc_lintsrc *)mpt;
MP_lintsrc_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
@@ -385,21 +364,21 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
printk(KERN_ERR "type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->mpc_length, 1);
- count = mpc->mpc_length;
+ 1, mpc, mpc->length, 1);
+ count = mpc->length;
break;
}
if (x86_quirks->mpc_record)
(*x86_quirks->mpc_record)++;
}
-#ifdef CONFIG_X86_GENERICARCH
- generic_bigsmp_probe();
+#ifdef CONFIG_X86_BIGSMP
+ generic_bigsmp_probe();
#endif
-#ifdef CONFIG_X86_32
- setup_apic_routing();
-#endif
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
@@ -417,16 +396,16 @@ static int __init ELCR_trigger(unsigned int irq)
static void __init construct_default_ioirq_mptable(int mpc_default_type)
{
- struct mpc_config_intsrc intsrc;
+ struct mpc_intsrc intsrc;
int i;
int ELCR_fallback = 0;
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* conforming */
- intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
+ intsrc.type = MP_INTSRC;
+ intsrc.irqflag = 0; /* conforming */
+ intsrc.srcbus = 0;
+ intsrc.dstapic = mp_ioapics[0].apicid;
- intsrc.mpc_irqtype = mp_INT;
+ intsrc.irqtype = mp_INT;
/*
* If true, we have an ISA/PCI system with no IRQ entries
@@ -469,30 +448,30 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* irqflag field (level sensitive, active high polarity).
*/
if (ELCR_trigger(i))
- intsrc.mpc_irqflag = 13;
+ intsrc.irqflag = 13;
else
- intsrc.mpc_irqflag = 0;
+ intsrc.irqflag = 0;
}
- intsrc.mpc_srcbusirq = i;
- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ intsrc.srcbusirq = i;
+ intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
MP_intsrc_info(&intsrc);
}
- intsrc.mpc_irqtype = mp_ExtINT;
- intsrc.mpc_srcbusirq = 0;
- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+ intsrc.irqtype = mp_ExtINT;
+ intsrc.srcbusirq = 0;
+ intsrc.dstirq = 0; /* 8259A to INTIN0 */
MP_intsrc_info(&intsrc);
}
static void __init construct_ioapic_table(int mpc_default_type)
{
- struct mpc_config_ioapic ioapic;
- struct mpc_config_bus bus;
+ struct mpc_ioapic ioapic;
+ struct mpc_bus bus;
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
+ bus.type = MP_BUS;
+ bus.busid = 0;
switch (mpc_default_type) {
default:
printk(KERN_ERR "???\nUnknown standard configuration %d\n",
@@ -500,29 +479,29 @@ static void __init construct_ioapic_table(int mpc_default_type)
/* fall through */
case 1:
case 5:
- memcpy(bus.mpc_bustype, "ISA ", 6);
+ memcpy(bus.bustype, "ISA ", 6);
break;
case 2:
case 6:
case 3:
- memcpy(bus.mpc_bustype, "EISA ", 6);
+ memcpy(bus.bustype, "EISA ", 6);
break;
case 4:
case 7:
- memcpy(bus.mpc_bustype, "MCA ", 6);
+ memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
- bus.mpc_busid = 1;
- memcpy(bus.mpc_bustype, "PCI ", 6);
+ bus.busid = 1;
+ memcpy(bus.bustype, "PCI ", 6);
MP_bus_info(&bus);
}
- ioapic.mpc_type = MP_IOAPIC;
- ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- ioapic.mpc_flags = MPC_APIC_USABLE;
- ioapic.mpc_apicaddr = 0xFEC00000;
+ ioapic.type = MP_IOAPIC;
+ ioapic.apicid = 2;
+ ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.flags = MPC_APIC_USABLE;
+ ioapic.apicaddr = 0xFEC00000;
MP_ioapic_info(&ioapic);
/*
@@ -536,8 +515,8 @@ static inline void __init construct_ioapic_table(int mpc_default_type) { }
static inline void __init construct_default_ISA_mptable(int mpc_default_type)
{
- struct mpc_config_processor processor;
- struct mpc_config_lintsrc lintsrc;
+ struct mpc_cpu processor;
+ struct mpc_lintsrc lintsrc;
int linttypes[2] = { mp_ExtINT, mp_NMI };
int i;
@@ -549,65 +528,65 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
/*
* 2 CPUs, numbered 0 & 1.
*/
- processor.mpc_type = MP_PROCESSOR;
+ processor.type = MP_PROCESSOR;
/* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.cpuflag = CPU_ENABLED;
+ processor.cpufeature = (boot_cpu_data.x86 << 8) |
(boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
+ processor.featureflag = boot_cpu_data.x86_capability[0];
+ processor.reserved[0] = 0;
+ processor.reserved[1] = 0;
for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
+ processor.apicid = i;
MP_processor_info(&processor);
}
construct_ioapic_table(mpc_default_type);
- lintsrc.mpc_type = MP_LINTSRC;
- lintsrc.mpc_irqflag = 0; /* conforming */
- lintsrc.mpc_srcbusid = 0;
- lintsrc.mpc_srcbusirq = 0;
- lintsrc.mpc_destapic = MP_APIC_ALL;
+ lintsrc.type = MP_LINTSRC;
+ lintsrc.irqflag = 0; /* conforming */
+ lintsrc.srcbusid = 0;
+ lintsrc.srcbusirq = 0;
+ lintsrc.destapic = MP_APIC_ALL;
for (i = 0; i < 2; i++) {
- lintsrc.mpc_irqtype = linttypes[i];
- lintsrc.mpc_destapiclint = i;
+ lintsrc.irqtype = linttypes[i];
+ lintsrc.destapiclint = i;
MP_lintsrc_info(&lintsrc);
}
}
-static struct intel_mp_floating *mpf_found;
+static struct mpf_intel *mpf_found;
/*
* Scan the memory blocks for an SMP configuration block.
*/
static void __init __get_smp_config(unsigned int early)
{
- struct intel_mp_floating *mpf = mpf_found;
+ struct mpf_intel *mpf = mpf_found;
+
+ if (!mpf)
+ return;
- if (x86_quirks->mach_get_smp_config) {
- if (x86_quirks->mach_get_smp_config(early))
- return;
- }
if (acpi_lapic && early)
return;
+
/*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
+ * MPS doesn't support hyperthreading, aka only have
+ * thread 0 apic id in MPS table
*/
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ if (acpi_lapic && acpi_ioapic)
return;
- } else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+
+ if (x86_quirks->mach_get_smp_config) {
+ if (x86_quirks->mach_get_smp_config(early))
+ return;
+ }
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->mpf_specification);
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
- if (mpf->mpf_feature2 & (1 << 7)) {
+ if (mpf->feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
@@ -618,7 +597,7 @@ static void __init __get_smp_config(unsigned int early)
/*
* Now see if we need to read further.
*/
- if (mpf->mpf_feature1 != 0) {
+ if (mpf->feature1 != 0) {
if (early) {
/*
* local APIC has default address
@@ -628,16 +607,16 @@ static void __init __get_smp_config(unsigned int early)
}
printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
+ mpf->feature1);
+ construct_default_ISA_mptable(mpf->feature1);
- } else if (mpf->mpf_physptr) {
+ } else if (mpf->physptr) {
/*
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+ if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
@@ -657,15 +636,15 @@ static void __init __get_smp_config(unsigned int early)
* ISA defaults and hope it will work.
*/
if (!mp_irq_entries) {
- struct mpc_config_bus bus;
+ struct mpc_bus bus;
printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
"using default mptable. "
"(tell your hw vendor)\n");
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- memcpy(bus.mpc_bustype, "ISA ", 6);
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ memcpy(bus.bustype, "ISA ", 6);
MP_bus_info(&bus);
construct_default_ioirq_mptable(0);
@@ -695,32 +674,32 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
unsigned int *bp = phys_to_virt(base);
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
+ mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
+ (mpf->length == 1) &&
!mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4))) {
+ ((mpf->specification == 1)
+ || (mpf->specification == 4))) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
+ mpf, (u64)virt_to_phys(mpf));
if (!reserve)
return 1;
reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
BOOTMEM_DEFAULT);
- if (mpf->mpf_physptr) {
+ if (mpf->physptr) {
unsigned long size = PAGE_SIZE;
#ifdef CONFIG_X86_32
/*
@@ -729,14 +708,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
* the bottom is mapped now.
* PC-9800's MPC table places on the very last
* of physical memory; so that simply reserving
- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
+ * PAGE_SIZE from mpf->physptr yields BUG()
* in reserve_bootmem.
*/
unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->mpf_physptr + size > end)
- size = end - mpf->mpf_physptr;
+ if (mpf->physptr + size > end)
+ size = end - mpf->physptr;
#endif
- reserve_bootmem_generic(mpf->mpf_physptr, size,
+ reserve_bootmem_generic(mpf->physptr, size,
BOOTMEM_DEFAULT);
}
@@ -803,28 +782,28 @@ void __init find_smp_config(void)
#ifdef CONFIG_X86_IO_APIC
static u8 __initdata irq_used[MAX_IRQ_SOURCES];
-static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
{
int i;
- if (m->mpc_irqtype != mp_INT)
+ if (m->irqtype != mp_INT)
return 0;
- if (m->mpc_irqflag != 0x0f)
+ if (m->irqflag != 0x0f)
return 0;
/* not legacy */
for (i = 0; i < mp_irq_entries; i++) {
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
- if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+ if (mp_irqs[i].srcbus != m->srcbus)
continue;
- if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+ if (mp_irqs[i].srcbusirq != m->srcbusirq)
continue;
if (irq_used[i]) {
/* already claimed */
@@ -840,10 +819,10 @@ static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
#define SPARE_SLOT_NUM 20
-static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
+static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
#endif
-static int __init replace_intsrc_all(struct mp_config_table *mpc,
+static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
unsigned long mpc_new_length)
{
@@ -855,36 +834,33 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
int count = sizeof(*mpc);
unsigned char *mpt = ((unsigned char *)mpc) + count;
- printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
- while (count < mpc->mpc_length) {
+ printk(KERN_INFO "mpc_length %x\n", mpc->length);
+ while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
{
- struct mpc_config_processor *m =
- (struct mpc_config_processor *)mpt;
+ struct mpc_cpu *m = (struct mpc_cpu *)mpt;
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_BUS:
{
- struct mpc_config_bus *m =
- (struct mpc_config_bus *)mpt;
+ struct mpc_bus *m = (struct mpc_bus *)mpt;
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_IOAPIC:
{
- mpt += sizeof(struct mpc_config_ioapic);
- count += sizeof(struct mpc_config_ioapic);
+ mpt += sizeof(struct mpc_ioapic);
+ count += sizeof(struct mpc_ioapic);
break;
}
case MP_INTSRC:
{
#ifdef CONFIG_X86_IO_APIC
- struct mpc_config_intsrc *m =
- (struct mpc_config_intsrc *)mpt;
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
printk(KERN_INFO "OLD ");
print_MP_intsrc_info(m);
@@ -905,14 +881,14 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
nr_m_spare++;
}
#endif
- mpt += sizeof(struct mpc_config_intsrc);
- count += sizeof(struct mpc_config_intsrc);
+ mpt += sizeof(struct mpc_intsrc);
+ count += sizeof(struct mpc_intsrc);
break;
}
case MP_LINTSRC:
{
- struct mpc_config_lintsrc *m =
- (struct mpc_config_lintsrc *)mpt;
+ struct mpc_lintsrc *m =
+ (struct mpc_lintsrc *)mpt;
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -922,7 +898,7 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
printk(KERN_ERR "type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->mpc_length, 1);
+ 1, mpc, mpc->length, 1);
goto out;
}
}
@@ -932,10 +908,10 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
if (irq_used[i])
continue;
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
if (nr_m_spare > 0) {
@@ -944,9 +920,8 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
m_spare[nr_m_spare] = NULL;
} else {
- struct mpc_config_intsrc *m =
- (struct mpc_config_intsrc *)mpt;
- count += sizeof(struct mpc_config_intsrc);
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
+ count += sizeof(struct mpc_intsrc);
if (!mpc_new_phys) {
printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
} else {
@@ -958,17 +933,16 @@ static int __init replace_intsrc_all(struct mp_config_table *mpc,
}
}
assign_to_mpc_intsrc(&mp_irqs[i], m);
- mpc->mpc_length = count;
- mpt += sizeof(struct mpc_config_intsrc);
+ mpc->length = count;
+ mpt += sizeof(struct mpc_intsrc);
}
print_mp_irq_info(&mp_irqs[i]);
}
#endif
out:
/* update checksum */
- mpc->mpc_checksum = 0;
- mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
- mpc->mpc_length);
+ mpc->checksum = 0;
+ mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
return 0;
}
@@ -1013,9 +987,8 @@ static int __init update_mp_table(void)
{
char str[16];
char oem[10];
- struct intel_mp_floating *mpf;
- struct mp_config_table *mpc;
- struct mp_config_table *mpc_new;
+ struct mpf_intel *mpf;
+ struct mpc_table *mpc, *mpc_new;
if (!enable_update_mptable)
return 0;
@@ -1027,21 +1000,21 @@ static int __init update_mp_table(void)
/*
* Now see if we need to go further.
*/
- if (mpf->mpf_feature1 != 0)
+ if (mpf->feature1 != 0)
return 0;
- if (!mpf->mpf_physptr)
+ if (!mpf->physptr)
return 0;
- mpc = phys_to_virt(mpf->mpf_physptr);
+ mpc = phys_to_virt(mpf->physptr);
if (!smp_check_mpc(mpc, oem, str))
return 0;
- printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
- printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
+ printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
+ printk(KERN_INFO "physptr: %x\n", mpf->physptr);
- if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+ if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
mpc_new_length);
@@ -1050,33 +1023,33 @@ static int __init update_mp_table(void)
if (!mpc_new_phys) {
unsigned char old, new;
/* check if we can change the postion */
- mpc->mpc_checksum = 0;
- old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
- mpc->mpc_checksum = 0xff;
- new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ mpc->checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->length);
+ mpc->checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
return 0;
}
printk(KERN_INFO "use in-positon replacing\n");
} else {
- mpf->mpf_physptr = mpc_new_phys;
+ mpf->physptr = mpc_new_phys;
mpc_new = phys_to_virt(mpc_new_phys);
- memcpy(mpc_new, mpc, mpc->mpc_length);
+ memcpy(mpc_new, mpc, mpc->length);
mpc = mpc_new;
/* check if we can modify that */
- if (mpc_new_phys - mpf->mpf_physptr) {
- struct intel_mp_floating *mpf_new;
+ if (mpc_new_phys - mpf->physptr) {
+ struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
mpf_new = phys_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
mpf = mpf_new;
- mpf->mpf_physptr = mpc_new_phys;
+ mpf->physptr = mpc_new_phys;
}
- mpf->mpf_checksum = 0;
- mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
+ mpf->checksum = 0;
+ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
}
/*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 82a7c7ed6d4..3cf3413ec62 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
static struct class *msr_class;
@@ -136,7 +136,7 @@ static int msr_open(struct inode *inode, struct file *file)
lock_kernel();
cpu = iminor(file->f_path.dentry->d_inode);
- if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
ret = -ENXIO; /* No such CPU */
goto out;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07f1c2..bdfad80c3cf 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -29,14 +29,12 @@
#include <asm/i8259.h>
#include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/nmi.h>
#include <asm/proto.h>
#include <asm/timer.h>
#include <asm/mce.h>
-#include <mach_traps.h>
+#include <asm/mach_traps.h>
int unknown_nmi_panic;
int nmi_watchdog_enabled;
@@ -63,11 +61,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
-#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
-#else
- return nmi_count(cpu);
-#endif
+ return per_cpu(irq_stat, cpu).__nmi_count;
}
static inline int mce_in_progress(void)
@@ -84,12 +78,8 @@ static inline int mce_in_progress(void)
*/
static inline unsigned int get_timer_irqs(int cpu)
{
-#ifdef CONFIG_X86_64
- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-#else
return per_cpu(irq_stat, cpu).apic_timer_irqs +
per_cpu(irq_stat, cpu).irq0_irqs;
-#endif
}
#ifdef CONFIG_SMP
@@ -131,6 +121,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
atomic_dec(&nmi_active);
}
+static void __acpi_nmi_disable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
int __init check_nmi_watchdog(void)
{
unsigned int *prev_nmi_count;
@@ -179,8 +174,12 @@ int __init check_nmi_watchdog(void)
kfree(prev_nmi_count);
return 0;
error:
- if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
- disable_8259A_irq(0);
+ if (nmi_watchdog == NMI_IO_APIC) {
+ if (!timer_through_8259)
+ disable_8259A_irq(0);
+ on_each_cpu(__acpi_nmi_disable, NULL, 1);
+ }
+
#ifdef CONFIG_X86_32
timer_ack = 0;
#endif
@@ -199,12 +198,17 @@ static int __init setup_nmi_watchdog(char *str)
++str;
}
- get_option(&str, &nmi);
-
- if (nmi >= NMI_INVALID)
- return 0;
+ if (!strncmp(str, "lapic", 5))
+ nmi_watchdog = NMI_LOCAL_APIC;
+ else if (!strncmp(str, "ioapic", 6))
+ nmi_watchdog = NMI_IO_APIC;
+ else {
+ get_option(&str, &nmi);
+ if (nmi >= NMI_INVALID)
+ return 0;
+ nmi_watchdog = nmi;
+ }
- nmi_watchdog = nmi;
return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
@@ -285,11 +289,6 @@ void acpi_nmi_enable(void)
on_each_cpu(__acpi_nmi_enable, NULL, 1);
}
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
/*
* Disable timer based NMIs on all CPUs:
*/
@@ -340,6 +339,8 @@ void stop_apic_nmi_watchdog(void *unused)
return;
if (nmi_watchdog == NMI_LOCAL_APIC)
lapic_watchdog_stop();
+ else
+ __acpi_nmi_disable(NULL);
__get_cpu_var(wd_enabled) = 0;
atomic_dec(&nmi_active);
}
@@ -465,6 +466,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
#ifdef CONFIG_SYSCTL
+static void enable_ioapic_nmi_watchdog_single(void *unused)
+{
+ __get_cpu_var(wd_enabled) = 1;
+ atomic_inc(&nmi_active);
+ __acpi_nmi_enable(NULL);
+}
+
+static void enable_ioapic_nmi_watchdog(void)
+{
+ on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
+ touch_nmi_watchdog();
+}
+
+static void disable_ioapic_nmi_watchdog(void)
+{
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
+}
+
static int __init setup_unknown_nmi_panic(char *str)
{
unknown_nmi_panic = 1;
@@ -507,6 +526,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
enable_lapic_nmi_watchdog();
else
disable_lapic_nmi_watchdog();
+ } else if (nmi_watchdog == NMI_IO_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_ioapic_nmi_watchdog();
+ else
+ disable_ioapic_nmi_watchdog();
} else {
printk(KERN_WARNING
"NMI watchdog doesn't know what hardware to touch\n");
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e..0cc41a1d255 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, IBM Corp.
*
- * All rights reserved.
+ * All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -23,17 +23,18 @@
* Send feedback to <gone@us.ibm.com>
*/
-#include <linux/mm.h>
+#include <linux/nodemask.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <asm/numaq.h>
-#include <asm/topology.h>
+#include <linux/mm.h>
+
#include <asm/processor.h>
-#include <asm/mpspec.h>
-#include <asm/e820.h>
+#include <asm/topology.h>
+#include <asm/genapic.h>
+#include <asm/numaq.h>
#include <asm/setup.h>
+#include <asm/e820.h>
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
@@ -91,19 +92,20 @@ static int __init numaq_pre_time_init(void)
}
int found_numaq;
+
/*
* Have to match translation table entries to main table entries by counter
* hence the mpc_record variable .... can't see a less disgusting way of
* doing this ....
*/
struct mpc_config_translation {
- unsigned char mpc_type;
- unsigned char trans_len;
- unsigned char trans_type;
- unsigned char trans_quad;
- unsigned char trans_global;
- unsigned char trans_local;
- unsigned short trans_reserved;
+ unsigned char mpc_type;
+ unsigned char trans_len;
+ unsigned char trans_type;
+ unsigned char trans_quad;
+ unsigned char trans_global;
+ unsigned char trans_local;
+ unsigned short trans_reserved;
};
/* x86_quirks member */
@@ -117,16 +119,15 @@ static inline int generate_logical_apicid(int quad, int phys_apicid)
}
/* x86_quirks member */
-static int mpc_apic_id(struct mpc_config_processor *m)
+static int mpc_apic_id(struct mpc_cpu *m)
{
int quad = translation_table[mpc_record]->trans_quad;
- int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+ int logical_apicid = generate_logical_apicid(quad, m->apicid);
printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
- m->mpc_apicid,
- (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
- m->mpc_apicver, quad, logical_apicid);
+ m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->cpufeature & CPU_MODEL_MASK) >> 4,
+ m->apicver, quad, logical_apicid);
return logical_apicid;
}
@@ -135,26 +136,26 @@ int mp_bus_id_to_node[MAX_MP_BUSSES];
int mp_bus_id_to_local[MAX_MP_BUSSES];
/* x86_quirks member */
-static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
+static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
{
int quad = translation_table[mpc_record]->trans_quad;
int local = translation_table[mpc_record]->trans_local;
- mp_bus_id_to_node[m->mpc_busid] = quad;
- mp_bus_id_to_local[m->mpc_busid] = local;
+ mp_bus_id_to_node[m->busid] = quad;
+ mp_bus_id_to_local[m->busid] = local;
printk(KERN_INFO "Bus #%d is %s (node %d)\n",
- m->mpc_busid, name, quad);
+ m->busid, name, quad);
}
int quad_local_to_mp_bus_id [NR_CPUS/4][4];
/* x86_quirks member */
-static void mpc_oem_pci_bus(struct mpc_config_bus *m)
+static void mpc_oem_pci_bus(struct mpc_bus *m)
{
int quad = translation_table[mpc_record]->trans_quad;
int local = translation_table[mpc_record]->trans_local;
- quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+ quad_local_to_mp_bus_id[quad][local] = m->busid;
}
static void __init MP_translation_info(struct mpc_config_translation *m)
@@ -186,7 +187,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
* Read/parse the MPC oem tables
*/
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable,
unsigned short oemsize)
{
int count = sizeof(*oemtable); /* the header size */
@@ -195,18 +196,18 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
mpc_record = 0;
printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
oemtable);
- if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+ if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
printk(KERN_WARNING
"SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->oem_signature[0], oemtable->oem_signature[1],
- oemtable->oem_signature[2], oemtable->oem_signature[3]);
+ oemtable->signature[0], oemtable->signature[1],
+ oemtable->signature[2], oemtable->signature[3]);
return;
}
- if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+ if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
return;
}
- while (count < oemtable->oem_length) {
+ while (count < oemtable->length) {
switch (*oemptr) {
case MP_TRANSLATION:
{
@@ -235,6 +236,13 @@ static int __init numaq_setup_ioapic_ids(void)
return 1;
}
+static int __init numaq_update_genapic(void)
+{
+ apic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
+
+ return 0;
+}
+
static struct x86_quirks numaq_x86_quirks __initdata = {
.arch_pre_time_init = numaq_pre_time_init,
.arch_time_init = NULL,
@@ -250,10 +258,10 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
.mpc_oem_pci_bus = mpc_oem_pci_bus,
.smp_read_mpc_oem = smp_read_mpc_oem,
.setup_ioapic_ids = numaq_setup_ioapic_ids,
+ .update_genapic = numaq_update_genapic,
};
-void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
+void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
{
if (strncmp(oem, "IBM NUMA", 8))
printk("Warning! Not a NUMA-Q system!\n");
@@ -285,3 +293,280 @@ int __init get_memcfg_numaq(void)
smp_dump_qct();
return 1;
}
+
+/*
+ * APIC driver for the IBM NUMAQ chipset.
+ */
+#define APIC_DEFINITION 1
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/ipi.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/numa.h>
+#include <linux/smp.h>
+#include <asm/numaq.h>
+#include <asm/io.h>
+#include <linux/mmzone.h>
+#include <linux/nodemask.h>
+
+#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline unsigned int numaq_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0x0F;
+}
+
+static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_logical(mask, vector);
+}
+
+static inline void numaq_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
+}
+
+static inline void numaq_send_IPI_all(int vector)
+{
+ numaq_send_IPI_mask(cpu_online_mask, vector);
+}
+
+extern void numaq_mps_oem_check(struct mpc_table *, char *, char *);
+
+#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
+#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
+
+/*
+ * Because we use NMIs rather than the INIT-STARTUP sequence to
+ * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
+ */
+static inline void numaq_smp_callin_clear_local_apic(void)
+{
+ clear_local_APIC();
+}
+
+static inline void
+numaq_store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Storing NMI vector\n");
+ *high =
+ *((volatile unsigned short *)phys_to_virt(NUMAQ_TRAMPOLINE_PHYS_HIGH));
+ *low =
+ *((volatile unsigned short *)phys_to_virt(NUMAQ_TRAMPOLINE_PHYS_LOW));
+}
+
+static inline const cpumask_t *numaq_target_cpus(void)
+{
+ return &CPU_MASK_ALL;
+}
+
+static inline unsigned long
+numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+
+static inline unsigned long numaq_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+#define apicid_cluster(apicid) (apicid & 0xF0)
+
+static inline int numaq_apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void numaq_init_apic_ldr(void)
+{
+ /* Already done in NUMA-Q firmware */
+}
+
+static inline void numaq_setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "NUMA-Q", nr_ioapics);
+}
+
+/*
+ * Skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum.
+ */
+static inline int numaq_multi_timer_check(int apic, int irq)
+{
+ return apic != 0 && irq == 0;
+}
+
+static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* We don't have a good way to do this yet - hack */
+ return physids_promote(0xFUL);
+}
+
+/* Mapping from cpu number to logical apicid */
+extern u8 cpu_2_logical_apicid[];
+
+static inline int numaq_cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+}
+
+/*
+ * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
+ * cpu to APIC ID relation to properly interact with the intelligent
+ * mode of the cluster controller.
+ */
+static inline int numaq_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < 60)
+ return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
+ else
+ return BAD_APICID;
+}
+
+static inline int numaq_apicid_to_node(int logical_apicid)
+{
+ return logical_apicid >> 4;
+}
+
+static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
+{
+ int node = numaq_apicid_to_node(logical_apicid);
+ int cpu = __ffs(logical_apicid & 0xf);
+
+ return physid_mask_of_physid(cpu + 4*node);
+}
+
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+
+static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+/*
+ * We use physical apicids here, not logical, so just return the default
+ * physical broadcast to stop people from breaking us
+ */
+static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ return 0x0F;
+}
+
+static inline unsigned int
+numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ return 0x0F;
+}
+
+/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
+static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+static int __numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ numaq_mps_oem_check(mpc, oem, productid);
+ return found_numaq;
+}
+
+static int probe_numaq(void)
+{
+ /* already know from get_memcfg_numaq() */
+ return found_numaq;
+}
+
+static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+}
+
+static void numaq_setup_portio_remap(void)
+{
+ int num_quads = num_online_nodes();
+
+ if (num_quads <= 1)
+ return;
+
+ printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
+ xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
+ printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
+ (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
+}
+
+struct genapic apic_numaq = {
+
+ .name = "NUMAQ",
+ .probe = probe_numaq,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = numaq_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* physical delivery on LOCAL quad: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = numaq_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = numaq_check_apicid_used,
+ .check_apicid_present = numaq_check_apicid_present,
+
+ .vector_allocation_domain = numaq_vector_allocation_domain,
+ .init_apic_ldr = numaq_init_apic_ldr,
+
+ .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
+ .setup_apic_routing = numaq_setup_apic_routing,
+ .multi_timer_check = numaq_multi_timer_check,
+ .apicid_to_node = numaq_apicid_to_node,
+ .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
+ .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
+ .setup_portio_remap = numaq_setup_portio_remap,
+ .check_phys_apicid_present = numaq_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = numaq_phys_pkg_id,
+ .mps_oem_check = __numaq_mps_oem_check,
+
+ .get_apic_id = numaq_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = numaq_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = numaq_send_IPI_allbutself,
+ .send_IPI_all = numaq_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
+
+ /* We don't do anything here because we use NMI's to boot instead */
+ .wait_for_init_deassert = NULL,
+
+ .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
+ .store_NMI_vector = numaq_store_NMI_vector,
+ .inquire_remote_apic = NULL,
+};
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 0e9f1982b1d..3a7c5a44082 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,7 +7,8 @@
#include <asm/paravirt.h>
-static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+static inline void
+default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_lock(lock);
}
@@ -25,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = {
};
EXPORT_SYMBOL(pv_lock_ops);
-void __init paravirt_use_bytelocks(void)
-{
-#ifdef CONFIG_SMP
- pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
- pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
- pv_lock_ops.spin_lock = __byte_spin_lock;
- pv_lock_ops.spin_trylock = __byte_spin_trylock;
- pv_lock_ops.spin_unlock = __byte_spin_unlock;
-#endif
-}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb60887..cea11c8e304 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
{
}
+/* identity function, which can be inlined */
+u32 _paravirt_ident_32(u32 x)
+{
+ return x;
+}
+
+u64 _paravirt_ident_64(u64 x)
+{
+ return x;
+}
+
static void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
if (opfunc == NULL)
/* If there's no function, patch it with a ud2a (BUG) */
ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
- else if (opfunc == paravirt_nop)
+ else if (opfunc == _paravirt_nop)
/* If the operation is a nop, then nop the callsite */
ret = paravirt_patch_nop();
+
+ /* identity functions just return their single argument */
+ else if (opfunc == _paravirt_ident_32)
+ ret = paravirt_patch_ident_32(insnbuf, len);
+ else if (opfunc == _paravirt_ident_64)
+ ret = paravirt_patch_ident_64(insnbuf, len);
+
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
struct pv_irq_ops pv_irq_ops = {
.init_IRQ = native_init_IRQ,
- .save_fl = native_save_fl,
- .restore_fl = native_restore_fl,
- .irq_disable = native_irq_disable,
- .irq_enable = native_irq_enable,
+ .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+ .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
.halt = native_halt,
#ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
#endif
};
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
struct pv_mmu_ops pv_mmu_ops = {
#ifndef CONFIG_X86_64
.pagetable_setup_start = native_pagetable_setup_start,
@@ -424,22 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
.pmd_clear = native_pmd_clear,
#endif
.set_pud = native_set_pud,
- .pmd_val = native_pmd_val,
- .make_pmd = native_make_pmd,
+
+ .pmd_val = PTE_IDENT,
+ .make_pmd = PTE_IDENT,
#if PAGETABLE_LEVELS == 4
- .pud_val = native_pud_val,
- .make_pud = native_make_pud,
+ .pud_val = PTE_IDENT,
+ .make_pud = PTE_IDENT,
+
.set_pgd = native_set_pgd,
#endif
#endif /* PAGETABLE_LEVELS >= 3 */
- .pte_val = native_pte_val,
- .pte_flags = native_pte_flags,
- .pgd_val = native_pgd_val,
+ .pte_val = PTE_IDENT,
+ .pgd_val = PTE_IDENT,
- .make_pte = native_make_pte,
- .make_pgd = native_make_pgd,
+ .make_pte = PTE_IDENT,
+ .make_pgd = PTE_IDENT,
.dup_mmap = paravirt_nop,
.exit_mmap = paravirt_nop,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861..d9f32e6d6ab 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ /* arg in %eax, return in %eax */
+ return 0;
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ /* arg in %edx:%eax, return in %edx:%eax */
+ return 0;
+}
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae..3f08f34f93e 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov32, end__mov32);
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov64, end__mov64);
+}
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e1e731d78f3..d28bbdc35e4 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1567,7 +1567,7 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtol(p, &endp, 0);
+ bridge = simple_strtoul(p, &endp, 0);
if (p == endp)
break;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 19262482021..b2542853314 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,6 +6,7 @@
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-/* This tells the BIO block layer to assume merging. Default to off
- because we cannot guarantee merging later. */
-int iommu_bio_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
@@ -42,7 +38,7 @@ EXPORT_SYMBOL(bad_dma_address);
be probably a smaller DMA mask, but this is bug-to-bug compatible
to older i386. */
struct device x86_dma_fallback_dev = {
- .bus_id = "fallback device",
+ .init_name = "fallback device",
.coherent_dma_mask = DMA_32BIT_MASK,
.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
@@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void)
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#endif
void __init pci_iommu_alloc(void)
{
+#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
+#endif
+
/*
* The order of these functions is important for
* fall-back/fail-over reasons
@@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void)
pci_swiotlb_init();
}
-unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
-{
- unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
-
- return size >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(iommu_nr_pages);
-#endif
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
@@ -188,7 +179,6 @@ static __init int iommu_setup(char *p)
}
if (!strncmp(p, "biomerge", 8)) {
- iommu_bio_merge = 4096;
iommu_merge = 1;
force_iommu = 1;
}
@@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected."
- "Disabling DAC.\n");
+ printk(KERN_INFO
+ "PCI: VIA PCI bridge detected. Disabling DAC.\n");
forbid_dac = 1;
}
}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a42b02b4df6..d5768b1af08 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -5,7 +5,7 @@
* This allows to use PCI devices that only support 32bit addresses on systems
* with more than 4GB.
*
- * See Documentation/DMA-mapping.txt for the interface specification.
+ * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU General Public License v2 only.
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
* to trigger bugs with some popular PCI cards, in particular 3ware (but
* has been also also seen with Qlogic at least).
*/
-int iommu_fullflush = 1;
+static int iommu_fullflush = 1;
/* Allocation bitmap for the remapping area: */
static DEFINE_SPINLOCK(iommu_bitmap_lock);
@@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
iommu_area_free(iommu_gart_bitmap, offset, size);
+ if (offset >= next_bit)
+ next_bit = offset + size;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
@@ -743,10 +745,8 @@ void __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
- printk(KERN_INFO "PCI-GART: No AMD GART found.\n");
+ if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
return;
- }
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111ab..d59c9174766 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
#include <linux/pci.h>
#include <linux/cache.h>
#include <linux/module.h>
+#include <linux/swiotlb.h>
+#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
#include <asm/iommu.h>
@@ -11,6 +13,31 @@
int swiotlb __read_mostly;
+void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+{
+ return alloc_bootmem_low_pages(size);
+}
+
+void *swiotlb_alloc(unsigned order, unsigned long nslabs)
+{
+ return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
+}
+
+dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
+{
+ return paddr;
+}
+
+phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+{
+ return baddr;
+}
+
+int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+{
+ return 0;
+}
+
static dma_addr_t
swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
+#ifdef CONFIG_X86_64
if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
+#endif
if (swiotlb_force)
swiotlb = 1;
if (swiotlb) {
diff --git a/arch/x86/kernel/probe_32.c b/arch/x86/kernel/probe_32.c
new file mode 100644
index 00000000000..22337b75de6
--- /dev/null
+++ b/arch/x86/kernel/probe_32.c
@@ -0,0 +1,411 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/genapic.h>
+#include <asm/setup.h>
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <asm/genapic.h>
+#include <asm/ipi.h>
+
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/arch_hooks.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+#include <asm/genapic.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+int no_broadcast = DEFAULT_SEND_IPI;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /*
+ * Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ *
+ * For that reason the mask written to *retmask does not depend
+ * on 'cpu': element 0 of the mask is always set to APIC_ALL_CPUS.
+ */
+ *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
+}
+
+/*
+ * Probe routine of the "default" driver: it unconditionally returns 1,
+ * so a caller that takes the first driver whose ->probe() is nonzero
+ * always accepts it. Hence it should be called last.
+ */
+static int probe_default(void)
+{
+ return 1;
+}
+
+struct genapic apic_default = {
+
+ .name = "default",
+ .probe = probe_default,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = default_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = default_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+ .check_apicid_present = default_check_apicid_present,
+
+ .vector_allocation_domain = default_vector_allocation_domain,
+ .init_apic_ldr = default_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = default_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = default_apicid_to_node,
+ .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = default_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = default_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = default_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = default_send_IPI_mask_logical,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+};
+
+extern struct genapic apic_numaq;
+extern struct genapic apic_summit;
+extern struct genapic apic_bigsmp;
+extern struct genapic apic_es7000;
+extern struct genapic apic_default;
+
+struct genapic *apic = &apic_default;
+
+static struct genapic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_NUMAQ
+ &apic_numaq,
+#endif
+#ifdef CONFIG_X86_SUMMIT
+ &apic_summit,
+#endif
+#ifdef CONFIG_X86_BIGSMP
+ &apic_bigsmp,
+#endif
+#ifdef CONFIG_X86_ES7000
+ &apic_es7000,
+#endif
+ &apic_default, /* must be last: its ->probe() always returns 1 */
+ NULL, /* terminator; the loops over apic_probe[] stop here */
+};
+
+static int cmdline_apic __initdata;
+static int __init parse_apic(char *arg)
+{
+ int i;
+
+ if (!arg)
+ return -EINVAL;
+
+ for (i = 0; apic_probe[i]; i++) {
+ if (!strcmp(apic_probe[i]->name, arg)) {
+ apic = apic_probe[i];
+ cmdline_apic = 1;
+ return 0;
+ }
+ }
+
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
+
+ /*
+ * Reached only when 'arg' matched no driver name in apic_probe[];
+ * 'apic' and cmdline_apic are left unchanged in that case. This is
+ * not reported as an error because the option is
+ * Parsed again by __setup for debug/verbose
+ */
+ return 0;
+}
+early_param("apic", parse_apic);
+
+void __init generic_bigsmp_probe(void)
+{
+#ifdef CONFIG_X86_BIGSMP
+ /*
+ * This routine is used to switch to bigsmp mode when
+ * - There is no apic= option specified by the user
+ * - generic_apic_probe() has chosen apic_default as the sub_arch
+ * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
+ *
+ * The first two conditions are tested directly below. The third is
+ * not tested here; the code relies on the result of
+ * apic_bigsmp.probe() (presumably that is where the CPU count is
+ * checked - confirm against the bigsmp driver).
+ *
+ * On success 'apic' is pointed at apic_bigsmp, the update_genapic
+ * quirk is invoked if one is set, and the override is logged.
+ * When CONFIG_X86_BIGSMP is not defined the function body is empty.
+ */
+
+ if (!cmdline_apic && apic == &apic_default) {
+ if (apic_bigsmp.probe()) {
+ apic = &apic_bigsmp;
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
+ printk(KERN_INFO "Overriding APIC driver with %s\n",
+ apic->name);
+ }
+ }
+#endif
+}
+
+void __init generic_apic_probe(void)
+{
+ if (!cmdline_apic) {
+ int i;
+ for (i = 0; apic_probe[i]; i++) {
+ if (apic_probe[i]->probe()) {
+ apic = apic_probe[i];
+ break;
+ }
+ }
+ /*
+ * The loop ran into the NULL terminator, i.e. no driver's
+ * ->probe() returned nonzero.
+ * Not visible without early console
+ */
+ if (!apic_probe[i])
+ panic("Didn't find an APIC driver");
+
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
+ }
+ printk(KERN_INFO "Using APIC driver %s\n", apic->name);
+}
+
+/*
+ * These functions can switch the APIC even after the initial ->probe()
+ *
+ * generic_mps_oem_check() and default_acpi_madt_oem_check() both walk
+ * apic_probe[] and call the corresponding OEM hook (->mps_oem_check or
+ * ->acpi_madt_oem_check) of every driver that provides one.
+ *
+ * On the first hook that returns nonzero they return 1. The matching
+ * driver is installed in 'apic' (followed by the update_genapic quirk,
+ * if set, and a log message) only when no driver was selected with
+ * apic= on the command line (cmdline_apic == 0); otherwise 'apic' is
+ * left alone but 1 is still returned.
+ *
+ * If no hook matches they return 0.
+ */
+
+int __init
+generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ int i;
+
+ for (i = 0; apic_probe[i]; ++i) {
+ if (!apic_probe[i]->mps_oem_check)
+ continue;
+ if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = apic_probe[i];
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ int i;
+
+ for (i = 0; apic_probe[i]; ++i) {
+ if (!apic_probe[i]->acpi_madt_oem_check)
+ continue;
+ if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = apic_probe[i];
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+/**
+ * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ *
+ * If the arch_pre_intr_init quirk is set and returns nonzero, the
+ * function returns early and init_ISA_irqs() is not called.
+ **/
+void __init pre_intr_init_hook(void)
+{
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
+ init_ISA_irqs();
+}
+
+/**
+ * intr_init_hook - post gate setup interrupt initialisation
+ *
+ * Description:
+ * Fill in any interrupts that may have been left out by the general
+ * init_IRQ() routine. interrupts having to do with the machine rather
+ * than the devices on the I/O bus (like APIC interrupts in intel MP
+ * systems) are started here.
+ *
+ * This implementation only invokes the arch_intr_init quirk, if one
+ * is set; nothing further is done here whatever the quirk returns.
+ **/
+void __init intr_init_hook(void)
+{
+ if (x86_quirks->arch_intr_init) {
+ if (x86_quirks->arch_intr_init())
+ return;
+ }
+}
+
+/**
+ * pre_setup_arch_hook - hook called prior to any setup_arch() execution
+ *
+ * Description:
+ * generally used to activate any machine specific identification
+ * routines that may be needed before setup_arch() runs. On Voyager
+ * this is used to get the board revision and type.
+ *
+ * This implementation is empty.
+ **/
+void __init pre_setup_arch_hook(void)
+{
+}
+
+/**
+ * trap_init_hook - initialise system specific traps
+ *
+ * Description:
+ * Called as the final act of trap_init(). Used in VISWS to initialise
+ * the various board specific APIC traps.
+ *
+ * This implementation only invokes the arch_trap_init quirk, if one
+ * is set; nothing further is done here whatever the quirk returns.
+ **/
+void __init trap_init_hook(void)
+{
+ if (x86_quirks->arch_trap_init) {
+ if (x86_quirks->arch_trap_init())
+ return;
+ }
+}
+
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+ .mask = CPU_MASK_NONE,
+ .name = "timer"
+};
+
+/**
+ * pre_time_init_hook - do any specific initialisations before the
+ * system timer is set up (presumably before time_init_hook() runs;
+ * confirm against the callers).
+ *
+ * Description:
+ * Invokes the arch_pre_time_init quirk if one is set. Nothing is done
+ * with the quirk's result.
+ **/
+void __init pre_time_init_hook(void)
+{
+ if (x86_quirks->arch_pre_time_init)
+ x86_quirks->arch_pre_time_init();
+}
+
+/**
+ * time_init_hook - do any specific initialisations for the system timer.
+ *
+ * Description:
+ * Must plug the system timer interrupt source at HZ into the IRQ listed
+ * in irq_vectors.h:TIMER_IRQ
+ *
+ * Unless the arch_time_init quirk is set and returns nonzero, irq0.mask
+ * is set to CPU 0 and irq0 is installed with setup_irq() on IRQ number
+ * 0, which is hard-coded in this function.
+ **/
+void __init time_init_hook(void)
+{
+ if (x86_quirks->arch_time_init) {
+ /*
+ * A nonzero return code does not mean failure, it means
+ * that the architecture quirk does not want any
+ * generic (timer) setup to be performed after this:
+ */
+ if (x86_quirks->arch_time_init())
+ return;
+ }
+
+ irq0.mask = cpumask_of_cpu(0);
+ setup_irq(0, &irq0);
+}
+
+#ifdef CONFIG_MCA
+/**
+ * mca_nmi_hook - hook into MCA specific NMI chain
+ *
+ * Description:
+ * The MCA (Microchannel Architecture) has an NMI chain for NMI sources
+ * along the MCA bus. Use this to hook into that chain if you will need
+ * it.
+ *
+ * This implementation only prints a warning.
+ **/
+void mca_nmi_hook(void)
+{
+ /*
+ * If I recall correctly, there's a whole bunch of other things that
+ * we can do to check for NMI problems, but that's all I know about
+ * at the moment.
+ */
+ pr_warning("NMI generated from unknown source!\n");
+}
+#endif
+
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ pr_info("Using %s mode\n",
+ no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
+ return 1;
+}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("Using IPI %s mode\n",
+ no_broadcast ? "No-Shortcut" : "Shortcut");
+ return 0;
+}
+
+late_initcall(print_ipi_mode);
+
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
index 675a48c404a..071e7fea42e 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -18,7 +18,7 @@
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/io.h>
-#include <setup_arch.h>
+#include <asm/setup_arch.h>
static struct resource system_rom_resource = {
.name = "System ROM",
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d..87b69d4fac1 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,13 +1,16 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <asm/idle.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/ftrace.h>
#include <asm/system.h>
+#include <asm/apic.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -100,6 +103,9 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 1);
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -112,6 +118,7 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
+ trace_power_end(&it);
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -122,6 +129,21 @@ void default_idle(void)
EXPORT_SYMBOL(default_idle);
#endif
+void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU: with local interrupts disabled above, clear
+ * the current CPU from cpu_online_map and disable its local APIC.
+ * The loop that follows never exits; it executes halt() each time
+ * hlt_works() is true for this CPU and just spins otherwise.
+ * The 'dummy' argument is not used.
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_local_APIC();
+
+ for (;;) {
+ if (hlt_works(smp_processor_id()))
+ halt();
+ }
+}
+
static void do_nothing(void *unused)
{
}
@@ -154,24 +176,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
if (!need_resched()) {
+ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
+ trace_power_end(&it);
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
+ struct power_trace it;
if (!need_resched()) {
+ trace_power_start(&it, POWER_CSTATE, 1);
+ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_power_end(&it);
} else
local_irq_enable();
}
@@ -183,9 +218,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 0);
local_irq_enable();
while (!need_resched())
cpu_relax();
+ trace_power_end(&it);
}
/*
@@ -270,7 +309,7 @@ static void c1e_idle(void)
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
c1e_detected = 1;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
@@ -311,7 +350,7 @@ static void c1e_idle(void)
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d4..fec79ad85dc 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -11,6 +11,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -38,11 +39,13 @@
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kdebug.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
@@ -55,19 +58,15 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/kdebug.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
-#include <asm/smp.h>
+#include <asm/ds.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* Return saved PC of a blocked thread.
*/
@@ -93,6 +92,15 @@ void cpu_idle(void)
{
int cpu = smp_processor_id();
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we won't ever return from this function (so the invalid
+ * canaries already on the stack won't ever trigger).
+ */
+ boot_init_stack_canary();
+
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
@@ -110,7 +118,6 @@ void cpu_idle(void)
play_dead();
local_irq_disable();
- __get_cpu_var(irq_stat).idle_timestamp = jiffies;
/* Don't trace irqs off for idle */
stop_critical_timings();
pm_idle();
@@ -134,7 +141,7 @@ void __show_regs(struct pt_regs *regs, int all)
if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
- savesegment(gs, gs);
+ gs = get_user_gs(regs);
} else {
sp = (unsigned long) (&regs->sp);
savesegment(ss, ss);
@@ -203,7 +210,7 @@ extern void kernel_thread_helper(void);
/*
* Create a kernel thread
*/
-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct pt_regs regs;
@@ -215,6 +222,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.ds = __USER_DS;
regs.es = __USER_DS;
regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -250,14 +258,8 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(current->thread.ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(current->thread.ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
+
+ ds_exit_thread(current);
}
void flush_thread(void)
@@ -270,7 +272,7 @@ void flush_thread(void)
tsk->thread.debugreg3 = 0;
tsk->thread.debugreg6 = 0;
tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
clear_tsk_thread_flag(tsk, TIF_DEBUG);
/*
* Forget coprocessor state..
@@ -297,9 +299,9 @@ void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct * p, struct pt_regs * regs)
+ struct task_struct *p, struct pt_regs *regs)
{
- struct pt_regs * childregs;
+ struct pt_regs *childregs;
struct task_struct *tsk;
int err;
@@ -313,7 +315,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
p->thread.ip = (unsigned long) ret_from_fork;
- savesegment(gs, p->thread.gs);
+ task_user_gs(p) = get_user_gs(regs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -339,13 +341,19 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+
+ ds_copy_thread(p, current);
+
+ clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+ p->thread.debugctlmsr = 0;
+
return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- __asm__("movl %0, %%gs" :: "r"(0));
+ set_user_gs(regs, 0);
regs->fs = 0;
set_fs(USER_DS);
regs->ds = __USER_DS;
@@ -419,48 +427,19 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- unsigned long ds_prev = 0;
- unsigned long ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
- }
- return debugctl;
-}
-#else
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- return debugctl;
-}
-#endif /* CONFIG_X86_DS */
-
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
struct thread_struct *prev, *next;
- unsigned long debugctl;
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = update_debugctl(prev, next, prev->debugctlmsr);
-
- if (next->debugctlmsr != debugctl)
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +461,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
hard_enable_TSC();
}
-#ifdef CONFIG_X86_PTRACE_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
- if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
-
-
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +518,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
@@ -579,7 +550,7 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- savesegment(gs, prev->gs);
+ lazy_save_gs(prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -625,31 +596,31 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
+ lazy_load_gs(next->gs);
- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);
return prev_p;
}
-asmlinkage int sys_fork(struct pt_regs regs)
+int sys_fork(struct pt_regs *regs)
{
- return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
-asmlinkage int sys_clone(struct pt_regs regs)
+int sys_clone(struct pt_regs *regs)
{
unsigned long clone_flags;
unsigned long newsp;
int __user *parent_tidptr, *child_tidptr;
- clone_flags = regs.bx;
- newsp = regs.cx;
- parent_tidptr = (int __user *)regs.dx;
- child_tidptr = (int __user *)regs.di;
+ clone_flags = regs->bx;
+ newsp = regs->cx;
+ parent_tidptr = (int __user *)regs->dx;
+ child_tidptr = (int __user *)regs->di;
if (!newsp)
- newsp = regs.sp;
- return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
}
/*
@@ -662,27 +633,27 @@ asmlinkage int sys_clone(struct pt_regs regs)
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
-asmlinkage int sys_vfork(struct pt_regs regs)
+int sys_vfork(struct pt_regs *regs)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
/*
* sys_execve() executes a new program.
*/
-asmlinkage int sys_execve(struct pt_regs regs)
+int sys_execve(struct pt_regs *regs)
{
int error;
- char * filename;
+ char *filename;
- filename = getname((char __user *) regs.bx);
+ filename = getname((char __user *) regs->bx);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
error = do_execve(filename,
- (char __user * __user *) regs.cx,
- (char __user * __user *) regs.dx,
- &regs);
+ (char __user * __user *) regs->cx,
+ (char __user * __user *) regs->dx,
+ regs);
if (error == 0) {
/* Make sure we don't return using sysenter.. */
set_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b..836ef6575f0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -39,22 +40,30 @@
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/ftrace.h>
+#include <linux/dmi.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
-#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
+#include <asm/ds.h>
asmlinkage extern void ret_from_fork(void);
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(unsigned long, old_rsp);
+static DEFINE_PER_CPU(unsigned char, is_idle);
+
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -73,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
void enter_idle(void)
{
- write_pda(isidle, 1);
+ percpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- if (test_and_clear_bit_pda(0, isidle) == 0)
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
@@ -109,6 +118,16 @@ static inline void play_dead(void)
void cpu_idle(void)
{
current_thread_info()->status |= TS_POLLING;
+
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we won't ever return from this function (so the invalid
+ * canaries already on the stack won't ever trigger).
+ */
+ boot_init_stack_canary();
+
/* endless idle loop with no priority at all */
while (1) {
tick_nohz_stop_sched_tick(1);
@@ -149,14 +168,18 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
+ const char *board;
printk("\n");
print_modules();
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
+ board = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!board)
+ board = "";
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version, board);
printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
@@ -235,14 +258,8 @@ void exit_thread(void)
t->io_bitmap_max = 0;
put_cpu();
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(t->ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(t->ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
+
+ ds_exit_thread(current);
}
void flush_thread(void)
@@ -372,6 +389,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (err)
goto out;
}
+
+ ds_copy_thread(p, me);
+
+ clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+ p->thread.debugctlmsr = 0;
+
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
@@ -390,7 +413,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
- write_pda(oldrsp, new_sp);
+ percpu_write(old_rsp, new_sp);
regs->cs = __USER_CS;
regs->ss = __USER_DS;
regs->flags = 0x200;
@@ -470,35 +493,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
struct tss_struct *tss)
{
struct thread_struct *prev, *next;
- unsigned long debugctl;
prev = &prev_p->thread,
next = &next_p->thread;
- debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
- {
- unsigned long ds_prev = 0, ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /*
- * We clear debugctl to make sure DS
- * is not in use when we change it:
- */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, ds_next);
- }
- }
-#endif /* CONFIG_X86_DS */
-
- if (next->debugctlmsr != debugctl)
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -533,14 +535,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-
-#ifdef CONFIG_X86_PTRACE_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
- if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -551,8 +545,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
* - could test fs/gs bitsliced
*
* Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer is not supported either.
*/
-struct task_struct *
+__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
@@ -639,21 +634,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = read_pda(oldrsp);
- write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ prev->usersp = percpu_read(old_rsp);
+ percpu_write(old_rsp, next->usersp);
+ percpu_write(current_task, next_p);
- write_pda(kernelstack,
+ percpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - PDA_STACKOFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
- write_pda(stack_canary, next_p->stack_canary);
- /*
- * Build time only check to make sure the stack_canary is at
- * offset 40 in the pda; this is a gcc ABI requirement
- */
- BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10..7ec39ab37a2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value)
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
- regno >>= 2;
- if (regno > FS)
- --regno;
- return &regs->bx + regno;
+ return &regs->bx + (regno >> 2);
}
static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
@@ -90,9 +87,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
if (offset != offsetof(struct user_regs_struct, gs))
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
- retval = task->thread.gs;
if (task == current)
- savesegment(gs, retval);
+ retval = get_user_gs(task_pt_regs(task));
+ else
+ retval = task_user_gs(task);
}
return retval;
}
@@ -126,13 +124,10 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- task->thread.gs = value;
if (task == current)
- /*
- * The user-mode %gs is not affected by
- * kernel entry, so we must update the CPU.
- */
- loadsegment(gs, value);
+ set_user_gs(task_pt_regs(task), value);
+ else
+ task_user_gs(task) = value;
}
return 0;
@@ -581,158 +576,91 @@ static int ioperm_get(struct task_struct *target,
}
#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * The configuration for a particular BTS hardware implementation.
- */
-struct bts_configuration {
- /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
- unsigned char sizeof_bts;
- /* the size of a field in the BTS record in bytes */
- unsigned char sizeof_field;
- /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
- unsigned long debugctl_mask;
-};
-static struct bts_configuration bts_cfg;
-
-#define BTS_MAX_RECORD_SIZE (8 * 3)
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- */
-
-enum bts_field {
- bts_from = 0,
- bts_to,
- bts_flags,
-
- bts_escape = (unsigned long)-1,
- bts_qual = bts_to,
- bts_jiffies = bts_flags
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
- base += (bts_cfg.sizeof_field * field);
- return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
-{
- base += (bts_cfg.sizeof_field * field);;
- (*(unsigned long *)base) = val;
-}
-
-/*
- * Translate a BTS record from the raw format into the bts_struct format
- *
- * out (out): bts_struct interpretation
- * raw: raw BTS record
- */
-static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
-{
- memset(out, 0, sizeof(*out));
- if (bts_get(raw, bts_from) == bts_escape) {
- out->qualifier = bts_get(raw, bts_qual);
- out->variant.jiffies = bts_get(raw, bts_jiffies);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = bts_get(raw, bts_from);
- out->variant.lbr.to_ip = bts_get(raw, bts_to);
- }
-}
-
static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
- struct bts_struct ret;
- const void *bts_record;
- size_t bts_index, bts_end;
+ const struct bts_trace *trace;
+ struct bts_struct bts;
+ const unsigned char *at;
int error;
- error = ds_get_bts_end(child, &bts_end);
- if (error < 0)
- return error;
-
- if (bts_end <= index)
- return -EINVAL;
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- error = ds_get_bts_index(child, &bts_index);
- if (error < 0)
- return error;
+ at = trace->ds.top - ((index + 1) * trace->ds.size);
+ if ((void *)at < trace->ds.begin)
+ at += (trace->ds.n * trace->ds.size);
- /* translate the ptrace bts index into the ds bts index */
- bts_index += bts_end - (index + 1);
- if (bts_end <= bts_index)
- bts_index -= bts_end;
+ if (!trace->read)
+ return -EOPNOTSUPP;
- error = ds_access_bts(child, bts_index, &bts_record);
+ error = trace->read(child->bts, at, &bts);
if (error < 0)
return error;
- ptrace_bts_translate_record(&ret, bts_record);
-
- if (copy_to_user(out, &ret, sizeof(ret)))
+ if (copy_to_user(out, &bts, sizeof(bts)))
return -EFAULT;
- return sizeof(ret);
+ return sizeof(bts);
}
static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
- struct bts_struct ret;
- const unsigned char *raw;
- size_t end, i;
- int error;
+ const struct bts_trace *trace;
+ const unsigned char *at;
+ int error, drained = 0;
- error = ds_get_bts_index(child, &end);
- if (error < 0)
- return error;
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
+
+ if (!trace->read)
+ return -EOPNOTSUPP;
- if (size < (end * sizeof(struct bts_struct)))
+ if (size < (trace->ds.top - trace->ds.begin))
return -EIO;
- error = ds_access_bts(child, 0, (const void **)&raw);
- if (error < 0)
- return error;
+ for (at = trace->ds.begin; (void *)at < trace->ds.top;
+ out++, drained++, at += trace->ds.size) {
+ struct bts_struct bts;
+ int error;
- for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
- ptrace_bts_translate_record(&ret, raw);
+ error = trace->read(child->bts, at, &bts);
+ if (error < 0)
+ return error;
- if (copy_to_user(out, &ret, sizeof(ret)))
+ if (copy_to_user(out, &bts, sizeof(bts)))
return -EFAULT;
}
- error = ds_clear_bts(child);
+ memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
+
+ error = ds_reset_bts(child->bts);
if (error < 0)
return error;
- return end;
+ return drained;
+}
+
+static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
+{
+ child->bts_buffer = alloc_locked_buffer(size);
+ if (!child->bts_buffer)
+ return -ENOMEM;
+
+ child->bts_size = size;
+
+ return 0;
}
-static void ptrace_bts_ovfl(struct task_struct *child)
+static void ptrace_bts_free_buffer(struct task_struct *child)
{
- send_sig(child->thread.bts_ovfl_signal, child, 0);
+ free_locked_buffer(child->bts_buffer, child->bts_size);
+ child->bts_buffer = NULL;
+ child->bts_size = 0;
}
static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +668,86 @@ static int ptrace_bts_config(struct task_struct *child,
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- int error = 0;
+ unsigned int flags = 0;
- error = -EOPNOTSUPP;
- if (!bts_cfg.sizeof_bts)
- goto errout;
-
- error = -EIO;
if (cfg_size < sizeof(cfg))
- goto errout;
+ return -EIO;
- error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- goto errout;
+ return -EFAULT;
- error = -EINVAL;
- if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
- !(cfg.flags & PTRACE_BTS_O_ALLOC))
- goto errout;
+ if (child->bts) {
+ ds_release_bts(child->bts);
+ child->bts = NULL;
+ }
- if (cfg.flags & PTRACE_BTS_O_ALLOC) {
- ds_ovfl_callback_t ovfl = NULL;
- unsigned int sig = 0;
+ if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+ if (!cfg.signal)
+ return -EINVAL;
- /* we ignore the error in case we were not tracing child */
- (void)ds_release_bts(child);
+ return -EOPNOTSUPP;
- if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
- if (!cfg.signal)
- goto errout;
+ child->thread.bts_ovfl_signal = cfg.signal;
+ }
- sig = cfg.signal;
- ovfl = ptrace_bts_ovfl;
- }
+ if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
+ (cfg.size != child->bts_size)) {
+ int error;
- error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
- if (error < 0)
- goto errout;
+ ptrace_bts_free_buffer(child);
- child->thread.bts_ovfl_signal = sig;
+ error = ptrace_bts_allocate_buffer(child, cfg.size);
+ if (error < 0)
+ return error;
}
- error = -EINVAL;
- if (!child->thread.ds_ctx && cfg.flags)
- goto errout;
-
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
- else
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ flags |= BTS_USER;
if (cfg.flags & PTRACE_BTS_O_SCHED)
- set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- else
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ flags |= BTS_TIMESTAMPS;
- error = sizeof(cfg);
+ child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
+ /* ovfl = */ NULL, /* th = */ (size_t)-1,
+ flags);
+ if (IS_ERR(child->bts)) {
+ int error = PTR_ERR(child->bts);
-out:
- if (child->thread.debugctlmsr)
- set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- else
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ ptrace_bts_free_buffer(child);
+ child->bts = NULL;
- return error;
+ return error;
+ }
-errout:
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- goto out;
+ return sizeof(cfg);
}
static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
+ const struct bts_trace *trace;
struct ptrace_bts_config cfg;
- size_t end;
- const void *base, *max;
- int error;
if (cfg_size < sizeof(cfg))
return -EIO;
- error = ds_get_bts_end(child, &end);
- if (error < 0)
- return error;
-
- error = ds_access_bts(child, /* index = */ 0, &base);
- if (error < 0)
- return error;
-
- error = ds_access_bts(child, /* index = */ end, &max);
- if (error < 0)
- return error;
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
memset(&cfg, 0, sizeof(cfg));
- cfg.size = (max - base);
+ cfg.size = trace->ds.end - trace->ds.begin;
cfg.signal = child->thread.bts_ovfl_signal;
cfg.bts_size = sizeof(struct bts_struct);
if (cfg.signal)
cfg.flags |= PTRACE_BTS_O_SIGNAL;
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+ if (trace->ds.flags & BTS_USER)
cfg.flags |= PTRACE_BTS_O_TRACE;
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ if (trace->ds.flags & BTS_TIMESTAMPS)
cfg.flags |= PTRACE_BTS_O_SCHED;
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,110 +756,77 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
-static int ptrace_bts_write_record(struct task_struct *child,
- const struct bts_struct *in)
+static int ptrace_bts_clear(struct task_struct *child)
{
- unsigned char bts_record[BTS_MAX_RECORD_SIZE];
+ const struct bts_trace *trace;
- BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- memset(bts_record, 0, bts_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
+ memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- case BTS_BRANCH:
- bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
- bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
- break;
+ return ds_reset_bts(child->bts);
+}
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- bts_set(bts_record, bts_from, bts_escape);
- bts_set(bts_record, bts_qual, in->qualifier);
- bts_set(bts_record, bts_jiffies, in->variant.jiffies);
- break;
+static int ptrace_bts_size(struct task_struct *child)
+{
+ const struct bts_trace *trace;
- default:
- return -EINVAL;
- }
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- /* The writing task will be the switched-to task on a context
- * switch. It needs to write into the switched-from task's BTS
- * buffer. */
- return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+ return (trace->ds.top - trace->ds.begin) / trace->ds.size;
}
-void ptrace_bts_take_timestamp(struct task_struct *tsk,
- enum bts_qualifier qualifier)
+static void ptrace_bts_fork(struct task_struct *tsk)
{
- struct bts_struct rec = {
- .qualifier = qualifier,
- .variant.jiffies = jiffies_64
- };
-
- ptrace_bts_write_record(tsk, &rec);
+ tsk->bts = NULL;
+ tsk->bts_buffer = NULL;
+ tsk->bts_size = 0;
+ tsk->thread.bts_ovfl_signal = 0;
}
-static const struct bts_configuration bts_cfg_netburst = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
-};
+static void ptrace_bts_untrace(struct task_struct *child)
+{
+ if (unlikely(child->bts)) {
+ ds_release_bts(child->bts);
+ child->bts = NULL;
+
+ /* We cannot update total_vm and locked_vm since
+ child's mm is already gone. But we can reclaim the
+ memory. */
+ kfree(child->bts_buffer);
+ child->bts_buffer = NULL;
+ child->bts_size = 0;
+ }
+}
-static const struct bts_configuration bts_cfg_pentium_m = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<6)|(1<<7)
-};
+static void ptrace_bts_detach(struct task_struct *child)
+{
+ if (unlikely(child->bts)) {
+ ds_release_bts(child->bts);
+ child->bts = NULL;
-static const struct bts_configuration bts_cfg_core2 = {
- .sizeof_bts = 8 * 3,
- .sizeof_field = 8,
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
-};
+ ptrace_bts_free_buffer(child);
+ }
+}
+#else
+static inline void ptrace_bts_fork(struct task_struct *tsk) {}
+static inline void ptrace_bts_detach(struct task_struct *child) {}
+static inline void ptrace_bts_untrace(struct task_struct *child) {}
+#endif /* CONFIG_X86_PTRACE_BTS */
-static inline void bts_configure(const struct bts_configuration *cfg)
+void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
{
- bts_cfg = *cfg;
+ ptrace_bts_fork(child);
}
-void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+void x86_ptrace_untrace(struct task_struct *child)
{
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0xD:
- case 0xE: /* Pentium M */
- bts_configure(&bts_cfg_pentium_m);
- break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- bts_configure(&bts_cfg_core2);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- case 0xF:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- bts_configure(&bts_cfg_netburst);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
+ ptrace_bts_untrace(child);
}
-#endif /* CONFIG_X86_PTRACE_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -972,15 +839,7 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
-#ifdef CONFIG_X86_PTRACE_BTS
- (void)ds_release_bts(child);
-
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-#endif /* CONFIG_X86_PTRACE_BTS */
+ ptrace_bts_detach(child);
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1112,7 +971,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ds_get_bts_index(child, /* pos = */ NULL);
+ ret = ptrace_bts_size(child);
break;
case PTRACE_BTS_GET:
@@ -1121,7 +980,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ds_clear_bts(child);
+ ret = ptrace_bts_clear(child);
break;
case PTRACE_BTS_DRAIN:
@@ -1384,6 +1243,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GET_THREAD_AREA:
case PTRACE_SET_THREAD_AREA:
+#ifdef CONFIG_X86_PTRACE_BTS
+ case PTRACE_BTS_CONFIG:
+ case PTRACE_BTS_STATUS:
+ case PTRACE_BTS_SIZE:
+ case PTRACE_BTS_GET:
+ case PTRACE_BTS_CLEAR:
+ case PTRACE_BTS_DRAIN:
+#endif /* CONFIG_X86_PTRACE_BTS */
return arch_ptrace(child, request, addr, data);
default:
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed8931..309949e9e1c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
+ ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
ich_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3cd512484e..32e8f0af292 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,9 @@
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
+#include <asm/pci_x86.h>
+#include <asm/virtext.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_32
# include <linux/dmi.h>
@@ -21,8 +24,7 @@
# include <asm/iommu.h>
#endif
-#include <mach_ipi.h>
-
+#include <asm/genapic.h>
/*
* Power off function, if any
@@ -39,7 +41,16 @@ int reboot_force;
static int reboot_cpu = -1;
#endif
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be in
+ * an inconsistent state and won't be able to clean up properly.
+ */
+static int reboot_emergency;
+
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] | p[ci] [, [w]arm | [c]old]
warm Don't set the cold reboot flag
cold Set the cold reboot flag
bios Reboot by jumping through the BIOS (only for X86_32)
@@ -48,6 +59,7 @@ static int reboot_cpu = -1;
kbd Use the keyboard controller. cold reset (default)
acpi Use the RESET_REG in the FADT
efi Use efi reset_system runtime service
+ pci Use the so-called "PCI reset register", CF9
force Avoid anything that could hang.
*/
static int __init reboot_setup(char *str)
@@ -82,6 +94,7 @@ static int __init reboot_setup(char *str)
case 'k':
case 't':
case 'e':
+ case 'p':
reboot_type = *str;
break;
@@ -172,6 +185,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 330",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -354,6 +376,48 @@ static inline void kb_wait(void)
}
}
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+ cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+ /* Just make sure we won't change CPUs while doing this */
+ local_irq_disable();
+
+ /* We need to disable VMX on all CPUs before rebooting, otherwise
+ * we risk hanging up the machine, because the CPU ignores INIT
+ * signals when VMX is enabled.
+ *
+ * We can't take any locks and we may be in an inconsistent
+ * state, so we use NMIs as IPIs to tell the other CPUs to disable
+ * VMX and halt.
+ *
+ * For safety, we will avoid running the nmi_shootdown_cpus()
+ * stuff unnecessarily, but we don't have a way to check
+ * if other CPUs have VMX enabled. So we will call it only if the
+ * CPU we are running on has VMX enabled.
+ *
+ * We will miss cases where VMX is not enabled on all CPUs. This
+ * shouldn't do much harm because KVM always enables VMX on all
+ * CPUs anyway. But we can miss it in the small window where KVM
+ * is still enabling VMX.
+ */
+ if (cpu_has_vmx() && cpu_vmx_enabled()) {
+ /* Disable VMX on this CPU.
+ */
+ cpu_vmxoff();
+
+ /* Halt and disable VMX on the other CPUs */
+ nmi_shootdown_cpus(vmxoff_nmi);
+
+ }
+}
+
+
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
@@ -362,6 +426,9 @@ static void native_machine_emergency_restart(void)
{
int i;
+ if (reboot_emergency)
+ emergency_vmx_disable_all();
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -398,12 +465,27 @@ static void native_machine_emergency_restart(void)
reboot_type = BOOT_KBD;
break;
-
case BOOT_EFI:
if (efi_enabled)
- efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
+ efi.reset_system(reboot_mode ?
+ EFI_RESET_WARM :
+ EFI_RESET_COLD,
EFI_SUCCESS, 0, NULL);
+ reboot_type = BOOT_KBD;
+ break;
+
+ case BOOT_CF9:
+ port_cf9_safe = true;
+ /* fall through */
+ case BOOT_CF9_COND:
+ if (port_cf9_safe) {
+ u8 cf9 = inb(0xcf9) & ~6;
+ outb(cf9|2, 0xcf9); /* Request hard reset */
+ udelay(50);
+ outb(cf9|6, 0xcf9); /* Actually do the reset */
+ udelay(50);
+ }
reboot_type = BOOT_KBD;
break;
}
@@ -420,7 +502,7 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_32
/* See if there has been given a command line override */
- if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
+ if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
cpu_online(reboot_cpu))
reboot_cpu_id = reboot_cpu;
#endif
@@ -430,7 +512,7 @@ void native_machine_shutdown(void)
reboot_cpu_id = smp_processor_id();
/* Make certain I only run on the appropriate processor */
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
+ set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
/* O.K Now that I'm on the appropriate processor,
* stop all of the others.
@@ -453,17 +535,28 @@ void native_machine_shutdown(void)
#endif
}
+static void __machine_emergency_restart(int emergency)
+{
+ reboot_emergency = emergency;
+ machine_ops.emergency_restart();
+}
+
static void native_machine_restart(char *__unused)
{
printk("machine restart\n");
if (!reboot_force)
machine_shutdown();
- machine_emergency_restart();
+ __machine_emergency_restart(0);
}
static void native_machine_halt(void)
{
+ /* stop other cpus and apics */
+ machine_shutdown();
+
+ /* stop this cpu */
+ stop_this_cpu(NULL);
}
static void native_machine_power_off(void)
@@ -498,7 +591,7 @@ void machine_shutdown(void)
void machine_emergency_restart(void)
{
- machine_ops.emergency_restart();
+ __machine_emergency_restart(1);
}
void machine_restart(char *cmd)
@@ -558,10 +651,7 @@ static int crash_nmi_callback(struct notifier_block *self,
static void smp_send_nmi_allbutself(void)
{
- cpumask_t mask = cpu_online_map;
- cpu_clear(safe_smp_processor_id(), mask);
- if (!cpus_empty(mask))
- send_IPI_mask(mask, NMI_VECTOR);
+ apic->send_IPI_allbutself(NMI_VECTOR);
}
static struct notifier_block crash_nmi_nb = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0fa6790c1dd..8fce6c71451 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -81,7 +81,7 @@
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/vmi.h>
-#include <setup_arch.h>
+#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -89,15 +89,17 @@
#include <asm/system.h>
#include <asm/vsyscall.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_apic.h>
+#include <asm/genapic.h>
#include <asm/paravirt.h>
+#include <asm/hypervisor.h>
#include <asm/percpu.h>
#include <asm/topology.h>
@@ -110,6 +112,20 @@
#define ARCH_SETUP
#endif
+unsigned int boot_cpu_id __read_mostly;
+
+#ifdef CONFIG_X86_64
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+ return __default_cpu_present_to_apicid(mps_cpu);
+}
+
+int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+}
+#endif
+
#ifndef CONFIG_DEBUG_BOOT_PARAMS
struct boot_params __initdata boot_params;
#else
@@ -448,6 +464,7 @@ static void __init reserve_early_setup_data(void)
* @size: Size of the crashkernel memory to reserve.
* Returns the base address on success, and -1ULL on failure.
*/
+static
unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
{
const unsigned long long alignment = 16<<20; /* 16M */
@@ -583,165 +600,27 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-static struct x86_quirks default_x86_quirks __initdata;
-
-struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
-
-/*
- * Some BIOSes seem to corrupt the low 64k of memory during events
- * like suspend/resume and unplugging an HDMI cable. Reserve all
- * remaining free memory in that area and fill it with a distinct
- * pattern.
- */
-#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
-#define MAX_SCAN_AREAS 8
-
-static int __read_mostly memory_corruption_check = -1;
-
-static unsigned __read_mostly corruption_check_size = 64*1024;
-static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
-static int num_scan_areas;
-
-
-static int set_corruption_check(char *arg)
+static int __init default_update_genapic(void)
{
- char *end;
-
- memory_corruption_check = simple_strtol(arg, &end, 10);
-
- return (*end == 0) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check", set_corruption_check);
-
-static int set_corruption_check_period(char *arg)
-{
- char *end;
-
- corruption_check_period = simple_strtoul(arg, &end, 10);
-
- return (*end == 0) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check_period", set_corruption_check_period);
-
-static int set_corruption_check_size(char *arg)
-{
- char *end;
- unsigned size;
-
- size = memparse(arg, &end);
-
- if (*end == '\0')
- corruption_check_size = size;
-
- return (size == corruption_check_size) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check_size", set_corruption_check_size);
-
-
-static void __init setup_bios_corruption_check(void)
-{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
-
- if (memory_corruption_check == -1) {
- memory_corruption_check =
-#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
- 1
-#else
- 0
+#ifdef CONFIG_SMP
+ if (!apic->wakeup_cpu)
+ apic->wakeup_cpu = wakeup_secondary_cpu_via_init;
#endif
- ;
- }
-
- if (corruption_check_size == 0)
- memory_corruption_check = 0;
-
- if (!memory_corruption_check)
- return;
-
- corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
-
- if (addr == 0)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
-
- if (size == 0)
- break;
-
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
-
- /* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
-
- addr += size;
- }
-
- printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
- num_scan_areas);
- update_e820();
-}
-
-static struct timer_list periodic_check_timer;
-
-void check_for_bios_corruption(void)
-{
- int i;
- int corruption = 0;
-
- if (!memory_corruption_check)
- return;
-
- for(i = 0; i < num_scan_areas; i++) {
- unsigned long *addr = __va(scan_areas[i].addr);
- unsigned long size = scan_areas[i].size;
-
- for(; size; addr++, size -= sizeof(unsigned long)) {
- if (!*addr)
- continue;
- printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
- addr, __pa(addr), *addr);
- corruption = 1;
- *addr = 0;
- }
- }
-
- WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
-}
-
-static void periodic_check_for_corruption(unsigned long data)
-{
- check_for_bios_corruption();
- mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
+ return 0;
}
-void start_periodic_check_for_corruption(void)
-{
- if (!memory_corruption_check || corruption_check_period == 0)
- return;
-
- printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
- corruption_check_period);
+static struct x86_quirks default_x86_quirks __initdata = {
+ .update_genapic = default_update_genapic,
+};
- init_timer(&periodic_check_timer);
- periodic_check_timer.function = &periodic_check_for_corruption;
- periodic_check_for_corruption(0);
-}
-#endif
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+#ifdef CONFIG_X86_RESERVE_LOW_64K
static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
{
printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working it around.\n",
+ "%s detected: BIOS may corrupt low RAM, working around it.\n",
d->ident);
e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
@@ -749,6 +628,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
return 0;
}
+#endif
/* List of systems that have known low memory corruption BIOS problems */
static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -764,7 +644,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
.callback = dmi_low_memory_corruption,
.ident = "Phoenix BIOS",
.matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
},
},
#endif
@@ -794,6 +674,9 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
+ /* VMI may relocate the fixmap; do this before touching ioremap area */
+ vmi_init();
+
early_cpu_init();
early_ioremap_init();
@@ -880,13 +763,8 @@ void __init setup_arch(char **cmdline_p)
check_efer();
#endif
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
- /*
- * Must be before kernel pagetables are setup
- * or fixmap area is touched.
- */
- vmi_init();
-#endif
+ /* Must be before kernel pagetables are setup */
+ vmi_activate();
/* after early param, so could get panic from serial */
reserve_early_setup_data();
@@ -909,6 +787,12 @@ void __init setup_arch(char **cmdline_p)
dmi_check_system(bad_bios_dmi_table);
+ /*
+ * VMware detection requires dmi to be available, so this
+ * needs to be done after dmi_scan_machine, for the BP.
+ */
+ init_hypervisor(&boot_cpu_data);
+
#ifdef CONFIG_X86_32
probe_roms();
#endif
@@ -1021,12 +905,11 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
-#endif
+
reserve_crashkernel();
#ifdef CONFIG_X86_64
@@ -1053,9 +936,7 @@ void __init setup_arch(char **cmdline_p)
map_vsyscall();
#endif
-#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
-#endif
early_quirks();
@@ -1082,7 +963,7 @@ void __init setup_arch(char **cmdline_p)
ioapic_init_mappings();
/* need to wait for io_apic is mapped */
- nr_irqs = probe_nr_irqs();
+ probe_nr_irqs_gsi();
kvm_guest_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb77..d992e6cff73 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -5,133 +5,54 @@
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
-#include <asm/smp.h>
-#include <asm/percpu.h>
+#include <linux/smp.h>
+#include <linux/topology.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
-#include <asm/topology.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
+#include <asm/proto.h>
+#include <asm/cpumask.h>
+#include <asm/cpu.h>
+#include <asm/stackprotector.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-unsigned int max_physical_apicid;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-#endif
-
-/* map cpu index to physical APIC ID */
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define X86_64_NUMA 1
-
-/* map cpu index to node index */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
-/* which logical CPUs are on which nodes */
-cpumask_t *node_to_cpumask_map;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-/* setup node_to_cpumask_map */
-static void __init setup_node_to_cpumask_map(void);
-
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
#else
-static inline void setup_node_to_cpumask_map(void) { }
+# define DBG(x...)
#endif
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
- */
-static void __init setup_per_cpu_maps(void)
-{
- int cpu;
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
- for_each_possible_cpu(cpu) {
- per_cpu(x86_cpu_to_apicid, cpu) =
- early_per_cpu_map(x86_cpu_to_apicid, cpu);
- per_cpu(x86_bios_cpu_apicid, cpu) =
- early_per_cpu_map(x86_bios_cpu_apicid, cpu);
-#ifdef X86_64_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#ifdef CONFIG_X86_64
+#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
+#else
+#define BOOT_PERCPU_OFFSET 0
#endif
- }
- /* indicate the early static arrays will soon be gone */
- early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
- early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
-#ifdef X86_64_NUMA
- early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
-#endif
-}
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+ [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
+};
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
+static inline void setup_percpu_segment(int cpu)
{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
+#ifdef CONFIG_X86_32
+ struct desc_struct gdt;
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
+ pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
+ 0x2 | DESCTYPE_S, 0x8);
+ gdt.s = 1;
+ write_gdt_entry(get_cpu_gdt_table(cpu),
+ GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
+}
/*
* Great future plan:
@@ -140,251 +61,86 @@ static void __init setup_cpu_pda_map(void)
*/
void __init setup_per_cpu_areas(void)
{
- ssize_t size, old_size;
+ ssize_t size;
char *ptr;
int cpu;
- unsigned long align = 1;
-
- /* Setup cpu_pda map */
- setup_cpu_pda_map();
/* Copy section for each CPU (we discard the original) */
- old_size = PERCPU_ENOUGH_ROOM;
- align = max_t(unsigned long, PAGE_SIZE, align);
- size = roundup(old_size, align);
- printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
- size);
+ size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+
+ pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
for_each_possible_cpu(cpu) {
#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = __alloc_bootmem(size, align,
- __pa(MAX_DMA_ADDRESS));
+ ptr = alloc_bootmem_pages(size);
#else
int node = early_cpu_to_node(cpu);
if (!node_online(node) || !NODE_DATA(node)) {
- ptr = __alloc_bootmem(size, align,
- __pa(MAX_DMA_ADDRESS));
- printk(KERN_INFO
- "cpu %d has no node %d or node-local memory\n",
+ ptr = alloc_bootmem_pages(size);
+ pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
- if (ptr)
- printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
- cpu, __pa(ptr));
- }
- else {
- ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
- __pa(MAX_DMA_ADDRESS));
- if (ptr)
- printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
- cpu, node, __pa(ptr));
+ pr_debug("per cpu data for cpu%d at %016lx\n",
+ cpu, __pa(ptr));
+ } else {
+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
+ cpu, node, __pa(ptr));
}
#endif
+
+ memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+ per_cpu(cpu_number, cpu) = cpu;
+ setup_percpu_segment(cpu);
+ setup_stack_canary_segment(cpu);
+ /*
+ * Copy data used in early init routines from the
+ * initial arrays to the per cpu data areas. These
+ * arrays then become expendable and the *_early_ptr's
+ * are zeroed indicating that the static arrays are
+ * gone.
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+ per_cpu(x86_cpu_to_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#endif
+#ifdef CONFIG_X86_64
+ per_cpu(irq_stack_ptr, cpu) =
+ per_cpu(irq_stack_union.irq_stack, cpu) +
+ IRQ_STACK_SIZE - 64;
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#endif
+#endif
+ /*
+ * Up to this point, the boot CPU has been using the .data.init
+ * area. Reload any changed state for the boot CPU.
+ */
+ if (cpu == boot_cpu_id)
+ switch_to_new_gdt(cpu);
+
+ DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}
- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
- NR_CPUS, nr_cpu_ids, nr_node_ids);
-
- /* Setup percpu data maps */
- setup_per_cpu_maps();
+ /* indicate the early static arrays will soon be gone */
+#ifdef CONFIG_X86_LOCAL_APIC
+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
-}
-
-#endif
-#ifdef X86_64_NUMA
-
-/*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
- *
- * Note: node_to_cpumask() is not valid until after this is done.
- */
-static void __init setup_node_to_cpumask_map(void)
-{
- unsigned int node, num = 0;
- cpumask_t *map;
-
- /* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
-
- /* allocate the map */
- map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
-
- pr_debug("Node to cpumask map at %p for %d nodes\n",
- map, nr_node_ids);
-
- /* node_to_cpumask() will now work */
- node_to_cpumask_map = map;
-}
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
- int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
-
- if (cpu_to_node_map)
- cpu_to_node_map[cpu] = node;
-
- else if (per_cpu_offset(cpu))
- per_cpu(x86_cpu_to_node_map, cpu) = node;
-
- else
- pr_debug("Setting node for non-present cpu %d\n", cpu);
-}
-
-void __cpuinit numa_clear_node(int cpu)
-{
- numa_set_node(cpu, NUMA_NO_NODE);
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-
-void __cpuinit numa_add_cpu(int cpu)
-{
- cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
- cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
-}
-
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
- int node = cpu_to_node(cpu);
- cpumask_t *mask;
- char buf[64];
-
- if (node_to_cpumask_map == NULL) {
- printk(KERN_ERR "node_to_cpumask_map NULL\n");
- dump_stack();
- return;
- }
-
- mask = &node_to_cpumask_map[node];
- if (enable)
- cpu_set(cpu, *mask);
- else
- cpu_clear(cpu, *mask);
-
- cpulist_scnprintf(buf, sizeof(buf), *mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
- }
-
-void __cpuinit numa_add_cpu(int cpu)
-{
- numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
- numa_set_cpumask(cpu, 0);
-}
-
-int cpu_to_node(int cpu)
-{
- if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
- printk(KERN_WARNING
- "cpu_to_node(%d): usage too early!\n", cpu);
- dump_stack();
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
- }
- return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(cpu_to_node);
-
-/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
- */
-int early_cpu_to_node(int cpu)
-{
- if (early_per_cpu_ptr(x86_cpu_to_node_map))
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
- if (!per_cpu_offset(cpu)) {
- printk(KERN_WARNING
- "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
- dump_stack();
- return NUMA_NO_NODE;
- }
- return per_cpu(x86_cpu_to_node_map, cpu);
+ /* Setup cpu initialized, callin, callout masks */
+ setup_cpu_local_masks();
}
-
-
-/* empty cpumask */
-static const cpumask_t cpu_mask_none;
-
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-const cpumask_t *_node_to_cpumask_ptr(int node)
-{
- if (node_to_cpumask_map == NULL) {
- printk(KERN_WARNING
- "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
- node);
- dump_stack();
- return (const cpumask_t *)&cpu_online_map;
- }
- if (node >= nr_node_ids) {
- printk(KERN_WARNING
- "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
- node, nr_node_ids);
- dump_stack();
- return &cpu_mask_none;
- }
- return &node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(_node_to_cpumask_ptr);
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- *
- * Side note: this function creates the returned cpumask on the stack
- * so with a high NR_CPUS count, excessive stack space is used. The
- * node_to_cpumask_ptr function should be used whenever possible.
- */
-cpumask_t node_to_cpumask(int node)
-{
- if (node_to_cpumask_map == NULL) {
- printk(KERN_WARNING
- "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
- dump_stack();
- return cpu_online_map;
- }
- if (node >= nr_node_ids) {
- printk(KERN_WARNING
- "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
- node, nr_node_ids);
- dump_stack();
- return cpu_mask_none;
- }
- return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(node_to_cpumask);
-
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-#endif /* X86_64_NUMA */
-
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
deleted file mode 100644
index cc673aa55ce..00000000000
--- a/arch/x86/kernel/sigframe.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifdef CONFIG_X86_32
-struct sigframe {
- char __user *pretcode;
- int sig;
- struct sigcontext sc;
- /*
- * fpstate is unused. fpstate is moved/allocated after
- * retcode[] below. This movement allows to have the FP state and the
- * future state extensions (xsave) stay together.
- * And at the same time retaining the unused fpstate, prevents changing
- * the offset of extramask[] in the sigframe and thus prevent any
- * legacy application accessing/modifying it.
- */
- struct _fpstate fpstate_unused;
- unsigned long extramask[_NSIG_WORDS-1];
- char retcode[8];
- /* fp state follows here */
-};
-
-struct rt_sigframe {
- char __user *pretcode;
- int sig;
- struct siginfo __user *pinfo;
- void __user *puc;
- struct siginfo info;
- struct ucontext uc;
- char retcode[8];
- /* fp state follows here */
-};
-#else
-struct rt_sigframe {
- char __user *pretcode;
- struct ucontext uc;
- struct siginfo info;
- /* fp state follows here */
-};
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
-#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c
index d6dd057d0f2..7cdcd16885e 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal.c
@@ -1,36 +1,41 @@
/*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
*
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-2002 x86-64 support by Andi Kleen
*/
-#include <linux/list.h>
-#include <linux/personality.h>
-#include <linux/binfmts.h>
-#include <linux/suspend.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
#include <linux/kernel.h>
-#include <linux/ptrace.h>
#include <linux/signal.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
#include <linux/errno.h>
-#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/ptrace.h>
#include <linux/tracehook.h>
-#include <linux/elf.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/vdso.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#include <asm/ia32_unistd.h>
+#include <asm/mce.h>
+#endif /* CONFIG_X86_64 */
+
#include <asm/syscall.h>
#include <asm/syscalls.h>
-#include "sigframe.h"
+#include <asm/sigframe.h>
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -45,99 +50,24 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
+#define COPY(x) do { \
+ get_user_ex(regs->x, &sc->x); \
+} while (0)
- return -ERESTARTNOHAND;
-}
+#define GET_SEG(seg) ({ \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ tmp; \
+})
-asmlinkage int
-sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
+#define COPY_SEG(seg) do { \
+ regs->seg = GET_SEG(seg); \
+} while (0)
- if (act) {
- old_sigset_t mask;
+#define COPY_SEG_CPL3(seg) do { \
+ regs->seg = GET_SEG(seg) | 3; \
+} while (0)
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
- return -EFAULT;
-
- __get_user(new_ka.sa.sa_flags, &act->sa_flags);
- __get_user(mask, &act->sa_mask);
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
- return -EFAULT;
-
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
- }
-
- return ret;
-}
-
-asmlinkage int sys_sigaltstack(unsigned long bx)
-{
- /*
- * This is needed to make gcc realize it doesn't own the
- * "struct pt_regs"
- */
- struct pt_regs *regs = (struct pt_regs *)&bx;
- const stack_t __user *uss = (const stack_t __user *)bx;
- stack_t __user *uoss = (stack_t __user *)regs->cx;
-
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-
-#define COPY(x) { \
- err |= __get_user(regs->x, &sc->x); \
-}
-
-#define COPY_SEG(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp; \
-}
-
-#define COPY_SEG_STRICT(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp | 3; \
-}
-
-#define GET_SEG(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- loadsegment(seg, tmp); \
-}
-
-/*
- * Do a signal return; undo the signal stack.
- */
static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned long *pax)
@@ -149,150 +79,136 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
- GET_SEG(gs);
- COPY_SEG(fs);
- COPY_SEG(es);
- COPY_SEG(ds);
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
- COPY_SEG_STRICT(cs);
- COPY_SEG_STRICT(ss);
-
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
-
- err |= __get_user(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
-
- err |= __get_user(*pax, &sc->ax);
- return err;
-}
+ get_user_try {
-asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
-{
- struct sigframe __user *frame;
- struct pt_regs *regs;
- unsigned long ax;
- sigset_t set;
+#ifdef CONFIG_X86_32
+ set_user_gs(regs, GET_SEG(gs));
+ COPY_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+#endif /* CONFIG_X86_32 */
- regs = (struct pt_regs *) &__unused;
- frame = (struct sigframe __user *)(regs->sp - 8);
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip);
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
- && __copy_from_user(&set.sig[1], &frame->extramask,
- sizeof(frame->extramask))))
- goto badframe;
+#ifdef CONFIG_X86_64
+ COPY(r8);
+ COPY(r9);
+ COPY(r10);
+ COPY(r11);
+ COPY(r12);
+ COPY(r13);
+ COPY(r14);
+ COPY(r15);
+#endif /* CONFIG_X86_64 */
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+#ifdef CONFIG_X86_32
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
+#else /* !CONFIG_X86_32 */
+ /* Kernel saves and restores only the CS segment register on signals,
+ * which is the bare minimum needed to allow mixed 32/64-bit code.
+ * App's signal handler can save/restore other segments if needed. */
+ COPY_SEG_CPL3(cs);
+#endif /* CONFIG_X86_32 */
- if (restore_sigcontext(regs, &frame->sc, &ax))
- goto badframe;
- return ax;
+ get_user_ex(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_ax = -1; /* disable syscall checks */
-badframe:
- if (show_unhandled_signals && printk_ratelimit()) {
- printk("%s%s[%d] bad frame in sigreturn frame:"
- "%p ip:%lx sp:%lx oeax:%lx",
- task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
- current->comm, task_pid_nr(current), frame, regs->ip,
- regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
- }
+ get_user_ex(buf, &sc->fpstate);
+ err |= restore_i387_xstate(buf);
- force_sig(SIGSEGV, current);
+ get_user_ex(*pax, &sc->ax);
+ } get_user_catch(err);
- return 0;
+ return err;
}
-static long do_rt_sigreturn(struct pt_regs *regs)
+static int
+setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
{
- struct rt_sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ int err = 0;
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
+ put_user_try {
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
- goto badframe;
+#ifdef CONFIG_X86_32
+ put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
+ put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
+ put_user_ex(regs->es, (unsigned int __user *)&sc->es);
+ put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
+#endif /* CONFIG_X86_32 */
- return ax;
+ put_user_ex(regs->di, &sc->di);
+ put_user_ex(regs->si, &sc->si);
+ put_user_ex(regs->bp, &sc->bp);
+ put_user_ex(regs->sp, &sc->sp);
+ put_user_ex(regs->bx, &sc->bx);
+ put_user_ex(regs->dx, &sc->dx);
+ put_user_ex(regs->cx, &sc->cx);
+ put_user_ex(regs->ax, &sc->ax);
+#ifdef CONFIG_X86_64
+ put_user_ex(regs->r8, &sc->r8);
+ put_user_ex(regs->r9, &sc->r9);
+ put_user_ex(regs->r10, &sc->r10);
+ put_user_ex(regs->r11, &sc->r11);
+ put_user_ex(regs->r12, &sc->r12);
+ put_user_ex(regs->r13, &sc->r13);
+ put_user_ex(regs->r14, &sc->r14);
+ put_user_ex(regs->r15, &sc->r15);
+#endif /* CONFIG_X86_64 */
+
+ put_user_ex(current->thread.trap_no, &sc->trapno);
+ put_user_ex(current->thread.error_code, &sc->err);
+ put_user_ex(regs->ip, &sc->ip);
+#ifdef CONFIG_X86_32
+ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->sp, &sc->sp_at_signal);
+ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
+#else /* !CONFIG_X86_32 */
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->cs, &sc->cs);
+ put_user_ex(0, &sc->gs);
+ put_user_ex(0, &sc->fs);
+#endif /* CONFIG_X86_32 */
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
- return 0;
-}
+ put_user_ex(fpstate, &sc->fpstate);
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
-{
- struct pt_regs *regs = (struct pt_regs *)&__unused;
+ /* non-iBCS2 extensions.. */
+ put_user_ex(mask, &sc->oldmask);
+ put_user_ex(current->thread.cr2, &sc->cr2);
+ } put_user_catch(err);
- return do_rt_sigreturn(regs);
+ return err;
}
/*
* Set up a signal frame.
*/
-static int
-setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
-{
- int tmp, err = 0;
-
- err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
- savesegment(gs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
-
- err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
- err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
- err |= __put_user(regs->di, &sc->di);
- err |= __put_user(regs->si, &sc->si);
- err |= __put_user(regs->bp, &sc->bp);
- err |= __put_user(regs->sp, &sc->sp);
- err |= __put_user(regs->bx, &sc->bx);
- err |= __put_user(regs->dx, &sc->dx);
- err |= __put_user(regs->cx, &sc->cx);
- err |= __put_user(regs->ax, &sc->ax);
- err |= __put_user(current->thread.trap_no, &sc->trapno);
- err |= __put_user(current->thread.error_code, &sc->err);
- err |= __put_user(regs->ip, &sc->ip);
- err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
- err |= __put_user(regs->flags, &sc->flags);
- err |= __put_user(regs->sp, &sc->sp_at_signal);
- err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
-
- tmp = save_i387_xstate(fpstate);
- if (tmp < 0)
- err = 1;
- else
- err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
-
- /* non-iBCS2 extensions.. */
- err |= __put_user(mask, &sc->oldmask);
- err |= __put_user(current->thread.cr2, &sc->cr2);
-
- return err;
-}
+#ifdef CONFIG_X86_32
+static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+} __attribute__((packed)) retcode = {
+ 0xb858, /* popl %eax; movl $..., %eax */
+ __NR_sigreturn,
+ 0x80cd, /* int $0x80 */
+};
+
+static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+} __attribute__((packed)) rt_retcode = {
+ 0xb8, /* movl $..., %eax */
+ __NR_rt_sigreturn,
+ 0x80cd, /* int $0x80 */
+ 0
+};
/*
* Determine which stack to use..
@@ -328,6 +244,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
if (used_math()) {
sp = sp - sig_xstate_size;
*fpstate = (struct _fpstate *) sp;
+ if (save_i387_xstate(*fpstate) < 0)
+ return (void __user *)-1L;
}
sp -= frame_size;
@@ -383,9 +301,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
* reasons and because gdb uses it as a signature to notice
* signal handler stack frames.
*/
- err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
- err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
- err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
+ err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
if (err)
return -EFAULT;
@@ -418,45 +334,41 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
- err |= __put_user(sig, &frame->sig);
- err |= __put_user(&frame->info, &frame->pinfo);
- err |= __put_user(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
- if (err)
- return -EFAULT;
-
- /* Create the ucontext. */
- if (cpu_has_xsave)
- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
- if (err)
- return -EFAULT;
+ put_user_try {
+ put_user_ex(sig, &frame->sig);
+ put_user_ex(&frame->info, &frame->pinfo);
+ put_user_ex(&frame->uc, &frame->puc);
+ err |= copy_siginfo_to_user(&frame->info, info);
- /* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
- err |= __put_user(restorer, &frame->pretcode);
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ /* Set up to return from userspace. */
+ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+ put_user_ex(restorer, &frame->pretcode);
- /*
- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
- err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
- err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+ /*
+ * This is movl $__NR_rt_sigreturn, %eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
+ } put_user_catch(err);
if (err)
return -EFAULT;
@@ -475,23 +387,286 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return 0;
}
+#else /* !CONFIG_X86_32 */
+/*
+ * Determine which stack to use..
+ */
+static void __user *
+get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
+{
+ /* Default to using the normal stack, skipping the 128-byte red zone. */
+ sp -= 128;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(sp) == 0)
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ }
+
+ return (void __user *)round_down(sp - size, 64);
+}
+
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ int err = 0;
+ struct task_struct *me = current;
+
+ if (used_math()) {
+ fp = get_stack(ka, regs->sp, sig_xstate_size);
+ frame = (void __user *)round_down(
+ (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
+
+ if (save_i387_xstate(fp) < 0)
+ return -EFAULT;
+ } else
+ frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, info))
+ return -EFAULT;
+ }
+
+ put_user_try {
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ /* x86-64 should always use SA_RESTORER. */
+ if (ka->sa.sa_flags & SA_RESTORER) {
+ put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
+ } else {
+ /* could use a vstub here */
+ err |= -EFAULT;
+ }
+ } put_user_catch(err);
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->di = sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ka->sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /* Set up the CS register to run signal handlers in 64-bit mode,
+ even if the handler happens to be interrupting 32-bit code. */
+ regs->cs = __USER_CS;
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_32
+/*
+ * Atomically swap in the new signal mask, and wait for a signal.
+ */
+asmlinkage int
+sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+{
+ mask &= _BLOCKABLE;
+ spin_lock_irq(&current->sighand->siglock);
+ current->saved_sigmask = current->blocked;
+ siginitset(&current->blocked, mask);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_restore_sigmask();
+
+ return -ERESTARTNOHAND;
+}
+
+asmlinkage int
+sys_sigaction(int sig, const struct old_sigaction __user *act,
+ struct old_sigaction __user *oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret = 0;
+
+ if (act) {
+ old_sigset_t mask;
+
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)))
+ return -EFAULT;
+
+ get_user_try {
+ get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
+ get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
+ get_user_ex(mask, &act->sa_mask);
+ get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
+ } get_user_catch(ret);
+
+ if (ret)
+ return -EFAULT;
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
+ return -EFAULT;
+
+ put_user_try {
+ put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
+ put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
+ put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+ put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
+ } put_user_catch(ret);
+
+ if (ret)
+ return -EFAULT;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_32
+int sys_sigaltstack(struct pt_regs *regs)
+{
+ const stack_t __user *uss = (const stack_t __user *)regs->bx;
+ stack_t __user *uoss = (stack_t __user *)regs->cx;
+
+ return do_sigaltstack(uss, uoss, regs->sp);
+}
+#else /* !CONFIG_X86_32 */
+asmlinkage long
+sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+ struct pt_regs *regs)
+{
+ return do_sigaltstack(uss, uoss, regs->sp);
+}
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+#ifdef CONFIG_X86_32
+unsigned long sys_sigreturn(struct pt_regs *regs)
+{
+ struct sigframe __user *frame;
+ unsigned long ax;
+ sigset_t set;
+
+ frame = (struct sigframe __user *)(regs->sp - 8);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
+ && __copy_from_user(&set.sig[1], &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->sc, &ax))
+ goto badframe;
+ return ax;
+
+badframe:
+ signal_fault(regs, frame, "sigreturn");
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+long sys_rt_sigreturn(struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ unsigned long ax;
+ sigset_t set;
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ goto badframe;
+
+ if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
+ goto badframe;
+
+ return ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
/*
* OK, we're invoking a handler:
*/
static int signr_convert(int sig)
{
+#ifdef CONFIG_X86_32
struct thread_info *info = current_thread_info();
if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
return info->exec_domain->signal_invmap[sig];
+#endif /* CONFIG_X86_32 */
return sig;
}
+#ifdef CONFIG_X86_32 /* is_ia32 is the constant 1; ia32_setup_* alias the __setup_* names */
+
#define is_ia32 1
#define ia32_setup_frame __setup_frame
#define ia32_setup_rt_frame __setup_rt_frame
+#else /* !CONFIG_X86_32 */
+
+#ifdef CONFIG_IA32_EMULATION
+#define is_ia32 test_thread_flag(TIF_IA32) /* evaluated at each use, not a constant */
+#else /* !CONFIG_IA32_EMULATION */
+#define is_ia32 0 /* constant 0 lets the is_ia32 branches be compiled out */
+#endif /* CONFIG_IA32_EMULATION */
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* declared only; defined outside this file */
+ sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka, /* declared only; defined outside this file */
+ sigset_t *set, struct pt_regs *regs);
+
+#endif /* CONFIG_X86_32 */
+
static int
setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
@@ -592,7 +767,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
return 0;
}
+#ifdef CONFIG_X86_32
#define NR_restart_syscall __NR_restart_syscall
+#else /* !CONFIG_X86_32: the number depends on the task's TIF_IA32 flag */
+#define NR_restart_syscall \
+ (test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall) /* parenthesised so ?: cannot bind to neighbouring operators */
+#endif /* CONFIG_X86_32 */
+
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -704,8 +885,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit()) {
- printk(KERN_INFO
+ printk("%s"
"%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+ task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
deleted file mode 100644
index a5c9627f4db..00000000000
--- a/arch/x86/kernel/signal_64.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- *
- * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
- * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
- * 2000-2002 x86-64 support by Andi Kleen
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/wait.h>
-#include <linux/ptrace.h>
-#include <linux/tracehook.h>
-#include <linux/unistd.h>
-#include <linux/stddef.h>
-#include <linux/personality.h>
-#include <linux/compiler.h>
-#include <linux/uaccess.h>
-
-#include <asm/processor.h>
-#include <asm/ucontext.h>
-#include <asm/i387.h>
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#include <asm/mce.h>
-#include <asm/syscall.h>
-#include <asm/syscalls.h>
-#include "sigframe.h"
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
-#ifdef CONFIG_X86_32
-# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
-#else
-# define FIX_EFLAGS __FIX_EFLAGS
-#endif
-
-asmlinkage long
-sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
- struct pt_regs *regs)
-{
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-
-#define COPY(x) { \
- err |= __get_user(regs->x, &sc->x); \
-}
-
-#define COPY_SEG_STRICT(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp | 3; \
-}
-
-/*
- * Do a signal return; undo the signal stack.
- */
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
-{
- void __user *buf;
- unsigned int tmpflags;
- unsigned int err = 0;
-
- /* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
- COPY(r8);
- COPY(r9);
- COPY(r10);
- COPY(r11);
- COPY(r12);
- COPY(r13);
- COPY(r14);
- COPY(r15);
-
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_STRICT(cs);
-
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
-
- err |= __get_user(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
-
- err |= __get_user(*pax, &sc->ax);
- return err;
-}
-
-static long do_rt_sigreturn(struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
-
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
- goto badframe;
-
- return ax;
-
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
- return 0;
-}
-
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
-{
- return do_rt_sigreturn(regs);
-}
-
-/*
- * Set up a signal frame.
- */
-
-static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
- unsigned long mask, struct task_struct *me)
-{
- int err = 0;
-
- err |= __put_user(regs->cs, &sc->cs);
- err |= __put_user(0, &sc->gs);
- err |= __put_user(0, &sc->fs);
-
- err |= __put_user(regs->di, &sc->di);
- err |= __put_user(regs->si, &sc->si);
- err |= __put_user(regs->bp, &sc->bp);
- err |= __put_user(regs->sp, &sc->sp);
- err |= __put_user(regs->bx, &sc->bx);
- err |= __put_user(regs->dx, &sc->dx);
- err |= __put_user(regs->cx, &sc->cx);
- err |= __put_user(regs->ax, &sc->ax);
- err |= __put_user(regs->r8, &sc->r8);
- err |= __put_user(regs->r9, &sc->r9);
- err |= __put_user(regs->r10, &sc->r10);
- err |= __put_user(regs->r11, &sc->r11);
- err |= __put_user(regs->r12, &sc->r12);
- err |= __put_user(regs->r13, &sc->r13);
- err |= __put_user(regs->r14, &sc->r14);
- err |= __put_user(regs->r15, &sc->r15);
- err |= __put_user(me->thread.trap_no, &sc->trapno);
- err |= __put_user(me->thread.error_code, &sc->err);
- err |= __put_user(regs->ip, &sc->ip);
- err |= __put_user(regs->flags, &sc->flags);
- err |= __put_user(mask, &sc->oldmask);
- err |= __put_user(me->thread.cr2, &sc->cr2);
-
- return err;
-}
-
-/*
- * Determine which stack to use..
- */
-
-static void __user *
-get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
-{
- unsigned long sp;
-
- /* Default to using normal stack - redzone*/
- sp = regs->sp - 128;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
- return (void __user *)round_down(sp - size, 64);
-}
-
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *fp = NULL;
- int err = 0;
- struct task_struct *me = current;
-
- if (used_math()) {
- fp = get_stack(ka, regs, sig_xstate_size);
- frame = (void __user *)round_down(
- (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-
- if (save_i387_xstate(fp) < 0)
- return -EFAULT;
- } else
- frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, info))
- return -EFAULT;
- }
-
- /* Create the ucontext. */
- if (cpu_has_xsave)
- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
- err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
- __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
- } else
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- /* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
- } else {
- /* could use a vstub here */
- return -EFAULT;
- }
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->di = sig;
- /* In case the signal handler was declared without prototypes */
- regs->ax = 0;
-
- /* This also works for non SA_SIGINFO handlers because they expect the
- next argument after the signal number on the stack. */
- regs->si = (unsigned long)&frame->info;
- regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- regs->sp = (unsigned long)frame;
-
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
- regs->cs = __USER_CS;
-
- return 0;
-}
-
-/*
- * OK, we're invoking a handler
- */
-static int signr_convert(int sig)
-{
- return sig;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-#define is_ia32 test_thread_flag(TIF_IA32)
-#else
-#define is_ia32 0
-#endif
-
-static int
-setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- int usig = signr_convert(sig);
- int ret;
-
- /* Set up the stack frame */
- if (is_ia32) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
- else
- ret = ia32_setup_frame(usig, ka, set, regs);
- } else
- ret = __setup_rt_frame(sig, ka, info, set, regs);
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
- }
-
- return ret;
-}
-
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
-{
- int ret;
-
- /* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
- /* If so, check system call restarting.. */
- switch (syscall_get_error(current, regs)) {
- case -ERESTART_RESTARTBLOCK:
- case -ERESTARTNOHAND:
- regs->ax = -EINTR;
- break;
-
- case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
- regs->ax = -EINTR;
- break;
- }
- /* fallthrough */
- case -ERESTARTNOINTR:
- regs->ax = regs->orig_ax;
- regs->ip -= 2;
- break;
- }
- }
-
- /*
- * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
- * flag so that register information in the sigcontext is correct.
- */
- if (unlikely(regs->flags & X86_EFLAGS_TF) &&
- likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
- regs->flags &= ~X86_EFLAGS_TF;
-
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
-
- if (ret)
- return ret;
-
-#ifdef CONFIG_X86_64
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
-#endif
-
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
-
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
-
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
-}
-
-#define NR_restart_syscall \
- test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- */
-static void do_signal(struct pt_regs *regs)
-{
- struct k_sigaction ka;
- siginfo_t info;
- int signr;
- sigset_t *oldset;
-
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else
- oldset = &current->blocked;
-
- signr = get_signal_to_deliver(&info, &ka, regs, NULL);
- if (signr > 0) {
- /*
- * Re-enable any watchpoints before delivering the
- * signal to user space. The processor register will
- * have been cleared if the watchpoint triggered
- * inside the kernel.
- */
- if (current->thread.debugreg7)
- set_debugreg(current->thread.debugreg7, 7);
-
- /* Whee! Actually deliver the signal. */
- if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /*
- * A signal was successfully delivered; the saved
- * sigmask will have been stored in the signal frame,
- * and will be restored by sigreturn, so we can simply
- * clear the TS_RESTORE_SIGMASK flag.
- */
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- }
- return;
- }
-
- /* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
- /* Restart the system call - no handlers present */
- switch (syscall_get_error(current, regs)) {
- case -ERESTARTNOHAND:
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- regs->ax = regs->orig_ax;
- regs->ip -= 2;
- break;
-
- case -ERESTART_RESTARTBLOCK:
- regs->ax = NR_restart_syscall;
- regs->ip -= 2;
- break;
- }
- }
-
- /*
- * If there's no signal to deliver, we just put the saved sigmask
- * back.
- */
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
- }
-}
-
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
- /* notify userspace of pending MCEs */
- if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_user();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
- /* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
-
-#ifdef CONFIG_X86_32
- clear_thread_flag(TIF_IRET);
-#endif /* CONFIG_X86_32 */
-}
-
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
-
- if (show_unhandled_signals && printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm, me->pid, where, frame,
- regs->ip, regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
- }
-
- force_sig(SIGSEGV, me);
-}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8..eaaffae31cc 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -1,8 +1,8 @@
/*
* Intel SMP support routines.
*
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,8 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_ipi.h>
-#include <mach_apic.h>
+#include <asm/genapic.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -118,39 +117,33 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+ apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
-void native_send_call_func_ipi(cpumask_t mask)
+void native_send_call_func_ipi(const struct cpumask *mask)
{
- cpumask_t allbutself;
+ cpumask_var_t allbutself; /* scratch mask: online CPUs minus the current one */
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
+ if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { /* no scratch mask: skip the all-but-self check */
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); /* fallback: IPI exactly the requested mask */
+ return;
+ }
+
+ cpumask_copy(allbutself, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), allbutself);
- if (cpus_equal(mask, allbutself) &&
- cpus_equal(cpu_online_map, cpu_callout_map))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ if (cpumask_equal(mask, allbutself) &&
+ cpumask_equal(cpu_online_mask, cpu_callout_mask))
+ apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); /* taken only when mask == allbutself and online == callout */
else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-}
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-static void stop_this_cpu(void *dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_local_APIC();
- if (hlt_works(smp_processor_id()))
- for (;;) halt();
- for (;;);
+ free_cpumask_var(allbutself); /* pairs with alloc_cpumask_var() above; the failed-alloc path returns early */
}
/*
@@ -178,11 +171,7 @@ static void native_smp_send_stop(void)
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
}
void smp_call_function_interrupt(struct pt_regs *regs)
@@ -190,11 +179,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
}
@@ -203,11 +188,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7b109339731..af57f88186e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,8 +1,8 @@
/*
* x86 SMP booting functions
*
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* Copyright 2001 Andi Kleen, SuSE Labs.
*
* Much of the core SMP work is based on previous work by Thomas Radke, to
@@ -53,7 +53,6 @@
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/idle.h>
-#include <asm/smp.h>
#include <asm/trampoline.h>
#include <asm/cpu.h>
#include <asm/numa.h>
@@ -62,11 +61,12 @@
#include <asm/mtrr.h>
#include <asm/vmi.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
+#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
-#include <mach_apic.h>
-#include <mach_wakecpu.h>
-#include <smpboot_hooks.h>
+#include <asm/genapic.h>
+#include <asm/smpboot_hooks.h>
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
@@ -101,15 +101,6 @@ EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
-/* bitmap of online cpus */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -125,9 +116,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
static atomic_t init_deasserted;
-/* representing cpus for which sibling maps can be computed */
-static cpumask_t cpu_sibling_setup_map;
-
/* Set if we find a B stepping CPU */
static int __cpuinitdata smp_b_stepping;
@@ -145,7 +133,7 @@ EXPORT_SYMBOL(cpu_to_node_map);
static void map_cpu_to_node(int cpu, int node)
{
printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
- cpu_set(cpu, node_to_cpumask_map[node]);
+ cpumask_set_cpu(cpu, &node_to_cpumask_map[node]);
cpu_to_node_map[cpu] = node;
}
@@ -156,7 +144,7 @@ static void unmap_cpu_to_node(int cpu)
printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
for (node = 0; node < MAX_NUMNODES; node++)
- cpu_clear(cpu, node_to_cpumask_map[node]);
+ cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]);
cpu_to_node_map[cpu] = 0;
}
#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
@@ -174,7 +162,7 @@ static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
int apicid = logical_smp_processor_id();
- int node = apicid_to_node(apicid);
+ int node = apic->apicid_to_node(apicid);
if (!node_online(node))
node = first_online_node;
@@ -207,14 +195,15 @@ static void __cpuinit smp_callin(void)
* our local APIC. We have to wait for the IPI or we'll
* lock up on an APIC access.
*/
- wait_for_init_deassert(&init_deasserted);
+ if (apic->wait_for_init_deassert)
+ apic->wait_for_init_deassert(&init_deasserted);
/*
* (This works even if the APIC is not enabled.)
*/
phys_id = read_apic_id();
cpuid = smp_processor_id();
- if (cpu_isset(cpuid, cpu_callin_map)) {
+ if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
phys_id, cpuid);
}
@@ -236,7 +225,7 @@ static void __cpuinit smp_callin(void)
/*
* Has the boot CPU finished it's STARTUP sequence?
*/
- if (cpu_isset(cpuid, cpu_callout_map))
+ if (cpumask_test_cpu(cpuid, cpu_callout_mask))
break;
cpu_relax();
}
@@ -254,7 +243,8 @@ static void __cpuinit smp_callin(void)
*/
pr_debug("CALLIN, before setup_local_APIC().\n");
- smp_callin_clear_local_apic();
+ if (apic->smp_callin_clear_local_apic)
+ apic->smp_callin_clear_local_apic();
setup_local_APIC();
end_local_APIC_setup();
map_cpu_to_logical_apicid();
@@ -279,7 +269,7 @@ static void __cpuinit smp_callin(void)
/*
* Allow the master to continue.
*/
- cpu_set(cpuid, cpu_callin_map);
+ cpumask_set_cpu(cpuid, cpu_callin_mask);
}
static int __cpuinitdata unsafe_smp;
@@ -287,16 +277,14 @@ static int __cpuinitdata unsafe_smp;
/*
* Activate a secondary processor.
*/
-static void __cpuinit start_secondary(void *unused)
+notrace static void __cpuinit start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
* fragile that we want to limit the things done here to the
* most necessary things.
*/
-#ifdef CONFIG_VMI
vmi_bringup();
-#endif
cpu_init();
preempt_disable();
smp_callin();
@@ -339,7 +327,7 @@ static void __cpuinit start_secondary(void *unused)
ipi_call_lock();
lock_vector_lock();
__setup_vector_irq(smp_processor_id());
- cpu_set(smp_processor_id(), cpu_online_map);
+ set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
@@ -445,50 +433,52 @@ void __cpuinit set_cpu_sibling_map(int cpu)
int i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- cpu_set(cpu, cpu_sibling_setup_map);
+ cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
if (smp_num_siblings > 1) {
- for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
- if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
- c->cpu_core_id == cpu_data(i).cpu_core_id) {
- cpu_set(i, per_cpu(cpu_sibling_map, cpu));
- cpu_set(cpu, per_cpu(cpu_sibling_map, i));
- cpu_set(i, per_cpu(cpu_core_map, cpu));
- cpu_set(cpu, per_cpu(cpu_core_map, i));
- cpu_set(i, c->llc_shared_map);
- cpu_set(cpu, cpu_data(i).llc_shared_map);
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ struct cpuinfo_x86 *o = &cpu_data(i);
+
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ cpumask_set_cpu(i, cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_sibling_mask(i));
+ cpumask_set_cpu(i, cpu_core_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(i));
+ cpumask_set_cpu(i, &c->llc_shared_map);
+ cpumask_set_cpu(cpu, &o->llc_shared_map);
}
}
} else {
- cpu_set(cpu, per_cpu(cpu_sibling_map, cpu));
+ cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
}
- cpu_set(cpu, c->llc_shared_map);
+ cpumask_set_cpu(cpu, &c->llc_shared_map);
if (current_cpu_data.x86_max_cores == 1) {
- per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu);
+ cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
c->booted_cores = 1;
return;
}
- for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
+ for_each_cpu(i, cpu_sibling_setup_mask) {
if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpu_set(i, c->llc_shared_map);
- cpu_set(cpu, cpu_data(i).llc_shared_map);
+ cpumask_set_cpu(i, &c->llc_shared_map);
+ cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map);
}
if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpu_set(i, per_cpu(cpu_core_map, cpu));
- cpu_set(cpu, per_cpu(cpu_core_map, i));
+ cpumask_set_cpu(i, cpu_core_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(i));
/*
* Does this new cpu bringup a new core?
*/
- if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) {
+ if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
*/
- if (first_cpu(per_cpu(cpu_sibling_map, i)) == i)
+ if (cpumask_first(cpu_sibling_mask(i)) == i)
c->booted_cores++;
/*
* increment the core count for all
@@ -503,7 +493,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
}
/* maps the cpu to the sched domain representing multi-core */
-cpumask_t cpu_coregroup_map(int cpu)
+const struct cpumask *cpu_coregroup_mask(int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
/*
@@ -511,9 +501,14 @@ cpumask_t cpu_coregroup_map(int cpu)
* And for power savings, we return cpu_core_map
*/
if (sched_mc_power_savings || sched_smt_power_savings)
- return per_cpu(cpu_core_map, cpu);
+ return cpu_core_mask(cpu);
else
- return c->llc_shared_map;
+ return &c->llc_shared_map;
+}
+
+cpumask_t cpu_coregroup_map(int cpu)
+{ /* by-value form of cpu_coregroup_mask() */
+ return *cpu_coregroup_mask(cpu); /* copies the whole mask that cpu_coregroup_mask() points to */
}
static void impress_friends(void)
@@ -525,7 +520,7 @@ static void impress_friends(void)
*/
pr_debug("Before bogomips.\n");
for_each_possible_cpu(cpu)
- if (cpu_isset(cpu, cpu_callout_map))
+ if (cpumask_test_cpu(cpu, cpu_callout_mask))
bogosum += cpu_data(cpu).loops_per_jiffy;
printk(KERN_INFO
"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
@@ -536,7 +531,7 @@ static void impress_friends(void)
pr_debug("Before bogocount - setting activated=1.\n");
}
-static inline void __inquire_remote_apic(int apicid)
+void __inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
char *names[] = { "ID", "VERSION", "SPIV" };
@@ -575,14 +570,13 @@ static inline void __inquire_remote_apic(int apicid)
}
}
-#ifdef WAKE_SECONDARY_VIA_NMI
/*
* Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-static int __devinit
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt;
@@ -590,7 +584,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
/* Target chip */
/* Boot on the stack */
/* Kick the second */
- apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
+ apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -599,7 +593,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -614,11 +608,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-#endif /* WAKE_SECONDARY_VIA_NMI */
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __devinit
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
@@ -737,7 +729,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-#endif /* WAKE_SECONDARY_VIA_INIT */
struct create_idle {
struct work_struct work;
@@ -755,57 +746,11 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- free_bootmem_pda(oldpda);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
+ * Returns zero if CPU booted OK, else error code from ->wakeup_cpu.
*/
{
unsigned long boot_error = 0;
@@ -818,16 +763,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
};
INIT_WORK(&c_idle.work, do_fork_idle);
-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);
c_idle.idle = get_idle_for_cpu(cpu);
@@ -857,14 +792,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
-#ifdef CONFIG_X86_32
per_cpu(current_task, cpu) = c_idle.idle;
- init_gdt(cpu);
+#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ initial_gs = per_cpu_offset(cpu);
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(c_idle.idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
@@ -888,7 +825,8 @@ do_rest:
pr_debug("Setting warm reset code and vector.\n");
- store_NMI_vector(&nmi_high, &nmi_low);
+ if (apic->store_NMI_vector)
+ apic->store_NMI_vector(&nmi_high, &nmi_low);
smpboot_setup_warm_reset_vector(start_ip);
/*
@@ -903,26 +841,26 @@ do_rest:
/*
* Starting actual IPI sequence...
*/
- boot_error = wakeup_secondary_cpu(apicid, start_ip);
+ boot_error = apic->wakeup_cpu(apicid, start_ip);
if (!boot_error) {
/*
* allow APs to start initializing.
*/
pr_debug("Before Callout %d.\n", cpu);
- cpu_set(cpu, cpu_callout_map);
+ cpumask_set_cpu(cpu, cpu_callout_mask);
pr_debug("After Callout %d.\n", cpu);
/*
* Wait 5s total for a response
*/
for (timeout = 0; timeout < 50000; timeout++) {
- if (cpu_isset(cpu, cpu_callin_map))
+ if (cpumask_test_cpu(cpu, cpu_callin_mask))
break; /* It has booted */
udelay(100);
}
- if (cpu_isset(cpu, cpu_callin_map)) {
+ if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
/* number CPUs logically, starting from 1 (BSP is 0) */
pr_debug("OK.\n");
printk(KERN_INFO "CPU%d: ", cpu);
@@ -937,19 +875,22 @@ do_rest:
else
/* trampoline code not run */
printk(KERN_ERR "Not responding.\n");
- if (get_uv_system_type() != UV_NON_UNIQUE_APIC)
- inquire_remote_apic(apicid);
+ if (apic->inquire_remote_apic)
+ apic->inquire_remote_apic(apicid);
}
}
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
+
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
- cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
- cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
- cpu_clear(cpu, cpu_present_map);
+
+ /* was set by do_boot_cpu() */
+ cpumask_clear_cpu(cpu, cpu_callout_mask);
+
+ /* was set by cpu_init() */
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
+
+ set_cpu_present(cpu, false);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
@@ -966,7 +907,7 @@ restore_state:
int __cpuinit native_cpu_up(unsigned int cpu)
{
- int apicid = cpu_present_to_apicid(cpu);
+ int apicid = apic->cpu_present_to_apicid(cpu);
unsigned long flags;
int err;
@@ -983,7 +924,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
/*
* Already booted CPU?
*/
- if (cpu_isset(cpu, cpu_callin_map)) {
+ if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
pr_debug("do_boot_cpu %d Already started\n", cpu);
return -ENOSYS;
}
@@ -1038,8 +979,9 @@ int __cpuinit native_cpu_up(unsigned int cpu)
*/
static __init void disable_smp(void)
{
- cpu_present_map = cpumask_of_cpu(0);
- cpu_possible_map = cpumask_of_cpu(0);
+ /* use the read/write pointers to the present and possible maps */
+ cpumask_copy(&cpu_present_map, cpumask_of(0));
+ cpumask_copy(&cpu_possible_map, cpumask_of(0));
smpboot_clear_io_apic_irqs();
if (smp_found_config)
@@ -1047,8 +989,8 @@ static __init void disable_smp(void)
else
physid_set_mask_of_physid(0, &phys_cpu_present_map);
map_cpu_to_logical_apicid();
- cpu_set(0, per_cpu(cpu_sibling_map, 0));
- cpu_set(0, per_cpu(cpu_core_map, 0));
+ cpumask_set_cpu(0, cpu_sibling_mask(0));
+ cpumask_set_cpu(0, cpu_core_mask(0));
}
/*
@@ -1058,26 +1000,26 @@ static int __init smp_sanity_check(unsigned max_cpus)
{
preempt_disable();
-#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
if (def_to_bigsmp && nr_cpu_ids > 8) {
unsigned int cpu;
unsigned nr;
printk(KERN_WARNING
"More than 8 CPUs detected - skipping them.\n"
- "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
+ "Use CONFIG_X86_BIGSMP.\n");
nr = 0;
for_each_present_cpu(cpu) {
if (nr >= 8)
- cpu_clear(cpu, cpu_present_map);
+ set_cpu_present(cpu, false);
nr++;
}
nr = 0;
for_each_possible_cpu(cpu) {
if (nr >= 8)
- cpu_clear(cpu, cpu_possible_map);
+ set_cpu_possible(cpu, false);
nr++;
}
@@ -1086,8 +1028,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
#endif
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
- "by the BIOS.\n", hard_smp_processor_id());
+ printk(KERN_WARNING
+ "weird, boot CPU (#%d) not listed by the BIOS.\n",
+ hard_smp_processor_id());
+
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
@@ -1109,7 +1053,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
*/
- if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
printk(KERN_NOTICE
"weird, boot CPU (#%d) not listed by the BIOS.\n",
boot_cpu_physical_apicid);
@@ -1127,6 +1071,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
printk(KERN_ERR "... forcing use of dummy APIC emulation."
"(tell your hw vendor)\n");
smpboot_clear_io_apic();
+ arch_disable_smp_support();
return -1;
}
@@ -1158,7 +1103,7 @@ static void __init smp_cpu_index_default(void)
for_each_possible_cpu(i) {
c = &cpu_data(i);
/* mark all to hotplug */
- c->cpu_index = NR_CPUS;
+ c->cpu_index = nr_cpu_ids;
}
}
@@ -1171,7 +1116,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
preempt_disable();
smp_cpu_index_default();
current_cpu_data = boot_cpu_data;
- cpu_callin_map = cpumask_of_cpu(0);
+ cpumask_copy(cpu_callin_mask, cpumask_of(0));
mb();
/*
* Setup boot CPU information
@@ -1185,7 +1130,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
#ifdef CONFIG_X86_64
enable_IR_x2apic();
- setup_apic_routing();
+ default_setup_apic_routing();
#endif
if (smp_sanity_check(max_cpus) < 0) {
@@ -1220,7 +1165,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
map_cpu_to_logical_apicid();
- setup_portio_remap();
+ if (apic->setup_portio_remap)
+ apic->setup_portio_remap();
smpboot_setup_io_apic();
/*
@@ -1242,12 +1188,9 @@ out:
void __init native_smp_prepare_boot_cpu(void)
{
int me = smp_processor_id();
-#ifdef CONFIG_X86_32
- init_gdt(me);
-#endif
- switch_to_new_gdt();
- /* already set me in cpu_online_map in boot_cpu_init() */
- cpu_set(me, cpu_callout_map);
+ switch_to_new_gdt(me);
+ /* already set me in cpu_online_mask in boot_cpu_init() */
+ cpumask_set_cpu(me, cpu_callout_mask);
per_cpu(cpu_state, me) = CPU_ONLINE;
}
@@ -1263,6 +1206,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
check_nmi_watchdog();
}
+static int __initdata setup_possible_cpus = -1;
+static int __init _setup_possible_cpus(char *str)
+{
+ get_option(&str, &setup_possible_cpus);
+ return 0;
+}
+early_param("possible_cpus", _setup_possible_cpus);
+
+
/*
* cpu_possible_map should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
@@ -1275,7 +1227,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
*
* Three ways to find out the number of additional hotplug CPUs:
* - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
+ * - The user can override it with possible_cpus=NUM
* - Otherwise don't reserve additional CPUs.
* We do this because additional CPUs waste a lot of memory.
* -AK
@@ -1288,15 +1240,25 @@ __init void prefill_possible_map(void)
if (!num_processors)
num_processors = 1;
- possible = num_processors + disabled_cpus;
- if (possible > NR_CPUS)
- possible = NR_CPUS;
+ if (setup_possible_cpus == -1)
+ possible = num_processors + disabled_cpus;
+ else
+ possible = setup_possible_cpus;
+
+ total_cpus = max_t(int, possible, num_processors + disabled_cpus);
+
+ if (possible > CONFIG_NR_CPUS) {
+ printk(KERN_WARNING
+ "%d Processors exceeds NR_CPUS limit of %d\n",
+ possible, CONFIG_NR_CPUS);
+ possible = CONFIG_NR_CPUS;
+ }
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
possible, max_t(int, possible - num_processors, 0));
for (i = 0; i < possible; i++)
- cpu_set(i, cpu_possible_map);
+ set_cpu_possible(i, true);
nr_cpu_ids = possible;
}
@@ -1308,31 +1270,31 @@ static void remove_siblinginfo(int cpu)
int sibling;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
- cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
+ for_each_cpu(sibling, cpu_core_mask(cpu)) {
+ cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
/*/
* last thread sibling in this cpu core going down
*/
- if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
+ if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
cpu_data(sibling).booted_cores--;
}
- for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
- cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- cpus_clear(per_cpu(cpu_core_map, cpu));
+ for_each_cpu(sibling, cpu_sibling_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+ cpumask_clear(cpu_sibling_mask(cpu));
+ cpumask_clear(cpu_core_mask(cpu));
c->phys_proc_id = 0;
c->cpu_core_id = 0;
- cpu_clear(cpu, cpu_sibling_setup_map);
+ cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
}
static void __ref remove_cpu_from_maps(int cpu)
{
- cpu_clear(cpu, cpu_online_map);
- cpu_clear(cpu, cpu_callout_map);
- cpu_clear(cpu, cpu_callin_map);
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, cpu_callout_mask);
+ cpumask_clear_cpu(cpu, cpu_callin_mask);
/* was set by cpu_init() */
- cpu_clear(cpu, cpu_initialized);
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
numa_remove_cpu(cpu);
}
@@ -1355,7 +1317,7 @@ void cpu_disable_common(void)
lock_vector_lock();
remove_cpu_from_maps(cpu);
unlock_vector_lock();
- fixup_irqs(cpu_online_map);
+ fixup_irqs();
}
int native_cpu_disable(void)
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839d..00000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * SMP stuff which is common to all sub-architectures.
- */
-#include <linux/module.h>
-#include <asm/smp.h>
-
-#ifdef CONFIG_X86_32
-DEFINE_PER_CPU(unsigned long, this_cpu_off);
-EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-
-/*
- * Initialize the CPU's GDT. This is either the boot CPU doing itself
- * (still using the master per-cpu area), or a CPU doing it for a
- * secondary which will soon come up.
- */
-__cpuinit void init_gdt(int cpu)
-{
- struct desc_struct gdt;
-
- pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
- gdt.s = 1;
-
- write_gdt_entry(get_cpu_gdt_table(cpu),
- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
-
- per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
- per_cpu(cpu_number, cpu) = cpu;
-}
-#endif
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c..f7bddc2e37d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -1,11 +1,12 @@
/*
* Stack trace management functions
*
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#include <linux/sched.h>
#include <linux/stacktrace.h>
#include <linux/module.h>
+#include <linux/uaccess.h>
#include <asm/stacktrace.h>
static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+ const struct pt_regs *regs = task_pt_regs(current);
+ const void __user *fp = (const void __user *)regs->bp;
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip;
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp)
+ break;
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] =
+ frame.ret_addr;
+ }
+ if (fp == frame.next_fp)
+ break;
+ fp = frame.next_fp;
+ }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm) {
+ __save_stack_trace_user(trace);
+ }
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index 7b987852e87..1e733eff9b3 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,8 +30,364 @@
#include <linux/init.h>
#include <asm/io.h>
#include <asm/bios_ebda.h>
-#include <asm/summit/mpparse.h>
+/*
+ * APIC driver for the IBM "Summit" chipset.
+ */
+#define APIC_DEFINITION 1
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+#include <asm/genapic.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/ipi.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+
+static inline unsigned summit_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector)
+{
+ default_send_IPI_mask_sequence_logical(mask, vector);
+}
+
+static inline void summit_send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ summit_send_IPI_mask(&mask, vector);
+}
+
+static inline void summit_send_IPI_all(int vector)
+{
+ summit_send_IPI_mask(&cpu_online_map, vector);
+}
+
+#include <asm/tsc.h>
+
+extern int use_cyclone;
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+extern void setup_summit(void);
+#else
+#define setup_summit() {}
+#endif
+
+static inline int
+summit_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(productid, "VIGIL SMP", 9)
+ || !strncmp(productid, "EXA", 3)
+ || !strncmp(productid, "RUTHLESS SMP", 12))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static inline int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "IBM", 3) &&
+ (!strncmp(oem_table_id, "SERVIGIL", 8)
+ || !strncmp(oem_table_id, "EXA", 3))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+struct rio_table_hdr {
+ unsigned char version; /* Version number of this data structure */
+ /* Version 3 adds chassis_num & WP_index */
+ unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
+ unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
+} __attribute__((packed));
+
+struct scal_detail {
+ unsigned char node_id; /* Scalability Node ID */
+ unsigned long CBAR; /* Address of 1MB register space */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port2node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ unsigned char node_id; /* RIO Node ID */
+ unsigned long BBAR; /* Address of 1MB register space */
+ unsigned char type; /* Type of device */
+ unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
+ /* For CYC: Node ID of Twister that owns this CYC */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
+ /* For CYC: 0 */
+ unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie:*/
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ /* For CYC: Bits0:7 Reserved */
+ unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ /* For CYC: No meaning */
+ unsigned char chassis_num; /* 1 based Chassis number */
+ /* For LookOut WPEGs this field indicates the */
+ /* Expansion Chassis #, enumerated from Boot */
+ /* Node WPEG external port, then Boot Node CYC */
+ /* external port, then Next Vigil chassis WPEG */
+ /* external port, etc. */
+ /* Shared Lookouts have only 1 chassis number (the */
+ /* first one assigned) */
+} __attribute__((packed));
+
+
+typedef enum {
+ CompatTwister = 0, /* Compatibility Twister */
+ AltTwister = 1, /* Alternate Twister of internal 8-way */
+ CompatCyclone = 2, /* Compatibility Cyclone */
+ AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
+ CompatWPEG = 4, /* Compatibility WPEG */
+ AltWPEG = 5, /* Second Planar WPEG */
+ LookOutAWPEG = 6, /* LookOut WPEG */
+ LookOutBWPEG = 7, /* LookOut WPEG */
+} node_type;
+
+static inline int is_WPEG(struct rio_detail *rio){
+ return (rio->type == CompatWPEG || rio->type == AltWPEG ||
+ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
+}
+
+
+/* In clustered mode, the high nibble of APIC ID is a cluster number.
+ * The low nibble is a 4-bit bitmap. */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+
+#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline const cpumask_t *summit_target_cpus(void)
+{
+ /* CPU_MASK_ALL (0xff) has undefined behaviour with
+ * dest_LowestPrio mode logical clustered apic interrupt routing.
+ * Just start on cpu 0. IRQ balancing will spread load.
+ */
+ return &cpumask_of_cpu(0);
+}
+
+static inline unsigned long
+summit_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+
+/* we don't use the phys_cpu_present_map to indicate apicid presence */
+static inline unsigned long summit_check_apicid_present(int bit)
+{
+ return 1;
+}
+
+#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+
+extern u8 cpu_2_logical_apicid[];
+
+static inline void summit_init_apic_ldr(void)
+{
+ unsigned long val, id;
+ int count = 0;
+ u8 my_id = (u8)hard_smp_processor_id();
+ u8 my_cluster = (u8)apicid_cluster(my_id);
+#ifdef CONFIG_SMP
+ u8 lid;
+ int i;
+
+ /* Create logical APIC IDs by counting CPUs already in cluster. */
+ for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
+ lid = cpu_2_logical_apicid[i];
+ if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
+ ++count;
+ }
+#endif
+ /* We only have a 4 wide bitmap in cluster mode. If a deranged
+ * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
+ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
+ id = my_cluster | (1UL << count);
+ apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static inline int summit_apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void summit_setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static inline int summit_apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int summit_cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static inline int summit_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t
+summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0x0F);
+}
+
+static inline physid_mask_t summit_apicid_to_cpu_present(int apicid)
+{
+ return physid_mask_of_physid(0);
+}
+
+static inline void summit_setup_portio_remap(void)
+{
+}
+
+static inline int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+static inline unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ int cpus_found = 0;
+ int num_bits_set;
+ int apicid;
+ int cpu;
+
+ num_bits_set = cpus_weight(*cpumask);
+ /* Return id to all */
+ if (num_bits_set >= nr_cpu_ids)
+ return 0xFF;
+ /*
+ * The cpus in the mask must all be on the apic cluster. If they are not
+ * on the same apicid cluster return default value of target_cpus():
+ */
+ cpu = first_cpu(*cpumask);
+ apicid = summit_cpu_to_logical_apicid(cpu);
+
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, *cpumask)) {
+ int new_apicid = summit_cpu_to_logical_apicid(cpu);
+
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)) {
+ printk ("%s: Not a valid mask!\n", __func__);
+
+ return 0xFF;
+ }
+ apicid = apicid | new_apicid;
+ cpus_found++;
+ }
+ cpu++;
+ }
+ return apicid;
+}
+
+static inline unsigned int
+summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+ const struct cpumask *andmask)
+{
+ int apicid = summit_cpu_to_logical_apicid(0);
+ cpumask_var_t cpumask;
+
+ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+ return apicid;
+
+ cpumask_and(cpumask, inmask, andmask);
+ cpumask_and(cpumask, cpumask, cpu_online_mask);
+ apicid = summit_cpu_mask_to_apicid(cpumask);
+
+ free_cpumask_var(cpumask);
+
+ return apicid;
+}
+
+/*
+ * cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value. For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
+ */
+static inline int summit_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+static int probe_summit(void)
+{
+ /* probed later in mptable/ACPI hooks */
+ return 0;
+}
+
+static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+}
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
static struct rio_table_hdr *rio_table_hdr __initdata;
static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
@@ -186,3 +542,61 @@ void __init setup_summit(void)
next_wpeg = 0;
} while (next_wpeg != 0);
}
+#endif
+
+struct genapic apic_summit = {
+
+ .name = "summit",
+ .probe = probe_summit,
+ .acpi_madt_oem_check = summit_acpi_madt_oem_check,
+ .apic_id_registered = summit_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = summit_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = summit_check_apicid_used,
+ .check_apicid_present = summit_check_apicid_present,
+
+ .vector_allocation_domain = summit_vector_allocation_domain,
+ .init_apic_ldr = summit_init_apic_ldr,
+
+ .ioapic_phys_id_map = summit_ioapic_phys_id_map,
+ .setup_apic_routing = summit_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = summit_apicid_to_node,
+ .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = summit_cpu_present_to_apicid,
+ .apicid_to_cpu_present = summit_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = summit_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = summit_phys_pkg_id,
+ .mps_oem_check = summit_mps_oem_check,
+
+ .get_apic_id = summit_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = summit_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = summit_send_IPI_allbutself,
+ .send_IPI_all = summit_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_cpu = NULL,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .store_NMI_vector = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+};
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c..3bdb64829b8 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -1,7 +1,7 @@
ENTRY(sys_call_table)
.long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
.long sys_exit
- .long sys_fork
+ .long ptregs_fork
.long sys_read
.long sys_write
.long sys_open /* 5 */
@@ -10,7 +10,7 @@ ENTRY(sys_call_table)
.long sys_creat
.long sys_link
.long sys_unlink /* 10 */
- .long sys_execve
+ .long ptregs_execve
.long sys_chdir
.long sys_time
.long sys_mknod
@@ -88,7 +88,7 @@ ENTRY(sys_call_table)
.long sys_uselib
.long sys_swapon
.long sys_reboot
- .long old_readdir
+ .long sys_old_readdir
.long old_mmap /* 90 */
.long sys_munmap
.long sys_truncate
@@ -109,17 +109,17 @@ ENTRY(sys_call_table)
.long sys_newlstat
.long sys_newfstat
.long sys_uname
- .long sys_iopl /* 110 */
+ .long ptregs_iopl /* 110 */
.long sys_vhangup
.long sys_ni_syscall /* old "idle" system call */
- .long sys_vm86old
+ .long ptregs_vm86old
.long sys_wait4
.long sys_swapoff /* 115 */
.long sys_sysinfo
.long sys_ipc
.long sys_fsync
- .long sys_sigreturn
- .long sys_clone /* 120 */
+ .long ptregs_sigreturn
+ .long ptregs_clone /* 120 */
.long sys_setdomainname
.long sys_newuname
.long sys_modify_ldt
@@ -165,14 +165,14 @@ ENTRY(sys_call_table)
.long sys_mremap
.long sys_setresuid16
.long sys_getresuid16 /* 165 */
- .long sys_vm86
+ .long ptregs_vm86
.long sys_ni_syscall /* Old sys_query_module */
.long sys_poll
.long sys_nfsservctl
.long sys_setresgid16 /* 170 */
.long sys_getresgid16
.long sys_prctl
- .long sys_rt_sigreturn
+ .long ptregs_rt_sigreturn
.long sys_rt_sigaction
.long sys_rt_sigprocmask /* 175 */
.long sys_rt_sigpending
@@ -185,11 +185,11 @@ ENTRY(sys_call_table)
.long sys_getcwd
.long sys_capget
.long sys_capset /* 185 */
- .long sys_sigaltstack
+ .long ptregs_sigaltstack
.long sys_sendfile
.long sys_ni_syscall /* reserved for streams1 */
.long sys_ni_syscall /* reserved for streams2 */
- .long sys_vfork /* 190 */
+ .long ptregs_vfork /* 190 */
.long sys_getrlimit
.long sys_mmap2
.long sys_truncate64
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 77b400f06ea..764c74e871f 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -38,7 +38,7 @@
#include <asm/time.h>
#include <asm/timer.h>
-#include "do_timer.h"
+#include <asm/do_timer.h>
int timer_ack;
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc);
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
/* Keep nmi watchdog up to date */
- per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
+ inc_irq_stat(irq0_irqs);
#ifdef CONFIG_X86_IO_APIC
if (timer_ack) {
@@ -105,8 +105,8 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
high bit of the PPI port B (0x61). Note that some PS/2s,
notably the 55SX, work fine if this is removed. */
- u8 irq_v = inb_p( 0x61 ); /* read the current state */
- outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
+ u8 irq_v = inb_p(0x61); /* read the current state */
+ outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
}
#endif
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d650c21..e6e695acd72 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -17,10 +17,10 @@
#include <linux/module.h>
#include <linux/time.h>
#include <linux/mca.h>
+#include <linux/nmi.h>
#include <asm/i8253.h>
#include <asm/hpet.h>
-#include <asm/nmi.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/timer.h>
@@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs)
}
EXPORT_SYMBOL(profile_pc);
-irqreturn_t timer_interrupt(int irq, void *dev_id)
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- add_pda(irq0_irqs, 1);
+ inc_irq_stat(irq0_irqs);
global_clock_event->event_handler(global_clock_event);
@@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void)
break;
no_ctr_free = (i == 4);
if (no_ctr_free) {
+ WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
+ "cpu_khz value may be incorrect.\n");
i = 3;
rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
wrmsrl(MSR_K7_EVNTSEL3, 0);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index f4049f3513b..00000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,257 +0,0 @@
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-
-#include <asm/tlbflush.h>
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
- ____cacheline_aligned = { &init_mm, 0, };
-
-/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
-
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct *flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-void smp_invalidate_interrupt(struct pt_regs *regs)
-{
- unsigned long cpu;
-
- cpu = get_cpu();
-
- if (!cpu_isset(cpu, flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
- if (flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(flush_va);
- } else
- leave_mm(cpu);
- }
- ack_APIC_irq();
- smp_mb__before_clear_bit();
- cpu_clear(cpu, flush_cpumask);
- smp_mb__after_clear_bit();
-out:
- put_cpu_no_resched();
- __get_cpu_var(irq_stat).irq_tlb_count++;
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
-{
- cpumask_t cpumask = *cpumaskp;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (unlikely(cpus_empty(cpumask)))
- return;
-#endif
-
- /*
- * i'm not happy about this global shared spinlock in the
- * MM hot path, but we'll see how contended it is.
- * AK: x86-64 has a faster method that could be ported.
- */
- spin_lock(&tlbstate_lock);
-
- flush_mm = mm;
- flush_va = va;
- cpus_or(flush_cpumask, cpumask, flush_cpumask);
-
- /*
- * Make the above memory operations globally visible before
- * sending the IPI.
- */
- smp_mb();
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
-
- while (!cpus_empty(flush_cpumask))
- /* nothing. lockup detection does not belong here */
- cpu_relax();
-
- flush_mm = NULL;
- flush_va = 0;
- spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void *info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1);
-}
-
-void reset_lazy_tlbstate(void)
-{
- int cpu = raw_smp_processor_id();
-
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 04431f34fd1..f396e61bcb3 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <asm/mmu_context.h>
+#include <asm/uv/uv.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
@@ -19,7 +20,7 @@
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
-#include <mach_apic.h>
+#include <asm/genapic.h>
static struct bau_control **uv_bau_table_bases __read_mostly;
static int uv_bau_retry_limit __read_mostly;
@@ -200,6 +201,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
destination_timeouts = 0;
}
}
+ cpu_relax();
}
return FLUSH_COMPLETE;
}
@@ -209,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
*
* Send a broadcast and wait for a broadcast message to complete.
*
- * The cpumaskp mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast was sent to.
*
- * Returns 1 if all remote flushing was done. The mask is zeroed.
- * Returns 0 if some remote flushing remains to be done. The mask is left
- * unchanged.
+ * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns @flush_mask if some remote flushing remains to be done. The
+ * mask will have some bits still set.
*/
-int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
- cpumask_t *cpumaskp)
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
+ struct bau_desc *bau_desc,
+ struct cpumask *flush_mask)
{
int completion_status = 0;
int right_shift;
@@ -256,66 +259,76 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
* the cpu's, all of which are still in the mask.
*/
__get_cpu_var(ptcstats).ptc_i++;
- return 0;
+ return flush_mask;
}
/*
* Success, so clear the remote cpu's from the mask so we don't
* use the IPI method of shootdown on them.
*/
- for_each_cpu_mask(bit, *cpumaskp) {
+ for_each_cpu(bit, flush_mask) {
blade = uv_cpu_to_blade_id(bit);
if (blade == this_blade)
continue;
- cpu_clear(bit, *cpumaskp);
+ cpumask_clear_cpu(bit, flush_mask);
}
- if (!cpus_empty(*cpumaskp))
- return 0;
- return 1;
+ if (!cpumask_empty(flush_mask))
+ return flush_mask;
+ return NULL;
}
/**
* uv_flush_tlb_others - globally purge translation cache of a virtual
* address or all TLB's
- * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @cpumask: mask of all cpu's in which the address is to be removed
* @mm: mm_struct containing virtual address range
* @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ * @cpu: the current cpu
*
* This is the entry point for initiating any UV global TLB shootdown.
*
* Purges the translation caches of all specified processors of the given
* virtual address, or purges all TLB's on specified processors.
*
- * The caller has derived the cpumaskp from the mm_struct and has subtracted
- * the local cpu from the mask. This function is called only if there
- * are bits set in the mask. (e.g. flush_tlb_page())
+ * The caller has derived the cpumask from the mm_struct. This function
+ * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
*
- * The cpumaskp is converted into a nodemask of the nodes containing
+ * The cpumask is converted into a nodemask of the nodes containing
* the cpus.
*
- * Returns 1 if all remote flushing was done.
- * Returns 0 if some remote flushing remains to be done.
+ * Note that this function should be called with preemption disabled.
+ *
+ * Returns NULL if all remote flushing was done.
+ * Returns pointer to cpumask if some remote flushing remains to be
+ * done. The returned pointer is valid till preemption is re-enabled.
*/
-int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
+const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm,
+ unsigned long va, unsigned int cpu)
{
+ static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
+ struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
int i;
int bit;
int blade;
- int cpu;
+ int uv_cpu;
int this_blade;
int locals = 0;
struct bau_desc *bau_desc;
- cpu = uv_blade_processor_id();
+ WARN_ON(!in_atomic());
+
+ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
+ uv_cpu = uv_blade_processor_id();
this_blade = uv_numa_blade_id();
bau_desc = __get_cpu_var(bau_control).descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
+ bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
i = 0;
- for_each_cpu_mask(bit, *cpumaskp) {
+ for_each_cpu(bit, flush_mask) {
blade = uv_cpu_to_blade_id(bit);
BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
if (blade == this_blade) {
@@ -330,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
* no off_node flushing; return status for local node
*/
if (locals)
- return 0;
+ return flush_mask;
else
- return 1;
+ return NULL;
}
__get_cpu_var(ptcstats).requestor++;
__get_cpu_var(ptcstats).ntargeted += i;
bau_desc->payload.address = va;
- bau_desc->payload.sending_cpu = smp_processor_id();
+ bau_desc->payload.sending_cpu = cpu;
- return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
+ return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
}
/*
@@ -566,14 +579,10 @@ static int __init uv_ptc_init(void)
if (!is_uv_system())
return 0;
- if (!proc_mkdir("sgi_uv", NULL))
- return -EINVAL;
-
proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
if (!proc_uv_ptc) {
printk(KERN_ERR "unable to create %s proc entry\n",
UV_PTC_BASENAME);
- remove_proc_entry("sgi_uv", NULL);
return -EINVAL;
}
proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
@@ -586,7 +595,6 @@ static int __init uv_ptc_init(void)
static struct bau_control * __init uv_table_bases_init(int blade, int node)
{
int i;
- int *ip;
struct bau_msg_status *msp;
struct bau_control *bau_tabp;
@@ -603,13 +611,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
bau_cpubits_clear(&msp->seen_by, (int)
uv_blade_nr_possible_cpus(blade));
- bau_tabp->watching =
- kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
- BUG_ON(!bau_tabp->watching);
-
- for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
- *ip = 0;
-
uv_bau_table_bases[blade] = bau_tabp;
return bau_tabp;
@@ -632,7 +633,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
bcp->bau_msg_head = bau_tablesp->va_queue_first;
bcp->va_queue_first = bau_tablesp->va_queue_first;
bcp->va_queue_last = bau_tablesp->va_queue_last;
- bcp->watching = bau_tablesp->watching;
bcp->msg_statuses = bau_tablesp->msg_statuses;
bcp->descriptor_base = adp;
}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 1106fac6024..808031a5ba1 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,10 +1,26 @@
#include <linux/io.h>
#include <asm/trampoline.h>
+#include <asm/e820.h>
/* ready for x86_64 and x86 */
unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
+void __init reserve_trampoline_memory(void)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
+#endif
+ /* Has to be in very low memory so we can execute real-mode AP code. */
+ reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
+ "TRAMPOLINE");
+}
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
*/
unsigned long setup_trampoline(void)
{
- memcpy(trampoline_base, trampoline_data,
- trampoline_end - trampoline_data);
+ memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
}
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 894293c598d..95a012a4664 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -29,6 +29,7 @@
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/segment.h>
+#include <asm/processor-flags.h>
.section .rodata, "a", @progbits
@@ -37,7 +38,7 @@
ENTRY(trampoline_data)
r_base = .
cli # We should be safe anyway
- wbinvd
+ wbinvd
mov %cs, %ax # Code and data in the same place
mov %ax, %ds
mov %ax, %es
@@ -73,9 +74,8 @@ r_base = .
lidtl tidt - r_base # load idt with 0, 0
lgdtl tgdt - r_base # load gdt with whatever is appropriate
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
+ mov $X86_CR0_PE, %ax # protected mode (PE) bit
+ lmsw %ax # into protected mode
# flush prefetch and jump to startup_32
ljmpl *(startup_32_vector - r_base)
@@ -86,9 +86,8 @@ startup_32:
movl $__KERNEL_DS, %eax # Initialize the %ds segment register
movl %eax, %ds
- xorl %eax, %eax
- btsl $5, %eax # Enable PAE mode
- movl %eax, %cr4
+ movl $X86_CR4_PAE, %eax
+ movl %eax, %cr4 # Enable PAE mode
# Setup trampoline 4 level pagetables
leal (trampoline_level4_pgt - r_base)(%esi), %eax
@@ -99,9 +98,9 @@ startup_32:
xorl %edx, %edx
wrmsr
- xorl %eax, %eax
- btsl $31, %eax # Enable paging and in turn activate Long Mode
- btsl $0, %eax # Enable protected mode
+ # Enable paging and in turn activate Long Mode
+ # Enable protected mode
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 04d242ab016..bde57f0f161 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -20,7 +20,6 @@
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/string.h>
-#include <linux/unwind.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
@@ -51,30 +50,22 @@
#include <asm/debugreg.h>
#include <asm/atomic.h>
#include <asm/system.h>
-#include <asm/unwind.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <mach_traps.h>
+#include <asm/mach_traps.h>
#ifdef CONFIG_X86_64
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pda.h>
#else
#include <asm/processor-flags.h>
#include <asm/arch_hooks.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/io.h>
#include <asm/traps.h>
#include "cpu/mcheck/mce.h"
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
-
asmlinkage int system_call(void);
/* Do we ignore FPU interrupts ? */
@@ -89,6 +80,9 @@ gate_desc idt_table[256]
__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
#endif
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
+
static int ignore_nmis;
static inline void conditional_sti(struct pt_regs *regs)
@@ -292,8 +286,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 8;
- /* This is always a kernel trap and never fixable (and thus must
- never return). */
+ /*
+ * This is always a kernel trap and never fixable (and thus must
+ * never return).
+ */
for (;;)
die(str, regs, error_code);
}
@@ -481,11 +477,7 @@ do_nmi(struct pt_regs *regs, long error_code)
{
nmi_enter();
-#ifdef CONFIG_X86_32
- { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
-#else
- add_pda(__nmi_count, 1);
-#endif
+ inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
@@ -524,9 +516,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
}
#ifdef CONFIG_X86_64
-/* Help handler running on IST stack to switch back to user stack
- for scheduling or signal handling. The actual stack switch is done in
- entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack
+ * for scheduling or signal handling. The actual stack switch is done in
+ * entry.S
+ */
asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
@@ -536,8 +530,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
/* Exception from user space */
else if (user_mode(eregs))
regs = task_pt_regs(current);
- /* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ /*
+ * Exception from kernel and interrupts are enabled. Move to
+ * kernel process stack.
+ */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -664,7 +660,7 @@ void math_error(void __user *ip)
{
struct task_struct *task;
siginfo_t info;
- unsigned short cwd, swd;
+ unsigned short cwd, swd, err;
/*
* Save the info for the exception handler and clear the error.
@@ -675,7 +671,6 @@ void math_error(void __user *ip)
task->thread.error_code = 0;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_code = __SI_FAULT;
info.si_addr = ip;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -689,34 +684,30 @@ void math_error(void __user *ip)
*/
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
- switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
-#ifdef CONFIG_X86_32
- return;
-#endif
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
+
+ err = swd & ~cwd;
+
+ if (err & 0x001) { /* Invalid op */
/*
* swd & 0x240 == 0x040: Stack Underflow
* swd & 0x240 == 0x240: Stack Overflow
* User must clear the SF bit (0x40) if set
*/
info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
+ } else if (err & 0x004) { /* Divide by Zero */
info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
+ } else if (err & 0x008) { /* Overflow */
info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ info.si_code = FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
info.si_code = FPE_FLTRES;
- break;
+ } else {
+ /*
+ * If we're using IRQ 13, or supposedly even some trap 16
+ * implementations, it's possible we get a spurious trap...
+ */
+ return; /* Spurious trap, no error */
}
force_sig_info(SIGFPE, &info, task);
}
@@ -904,7 +895,7 @@ asmlinkage void math_state_restore(void)
EXPORT_SYMBOL_GPL(math_state_restore);
#ifndef CONFIG_MATH_EMULATION
-asmlinkage void math_emulate(long arg)
+void math_emulate(struct math_emu_info *info)
{
printk(KERN_EMERG
"math-emulation not enabled and no coprocessor found.\n");
@@ -915,12 +906,16 @@ asmlinkage void math_emulate(long arg)
#endif /* CONFIG_MATH_EMULATION */
dotraplinkage void __kprobes
-do_device_not_available(struct pt_regs *regs, long error)
+do_device_not_available(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_X86_32
if (read_cr0() & X86_CR0_EM) {
+ struct math_emu_info info = { };
+
conditional_sti(regs);
- math_emulate(0);
+
+ info.regs = regs;
+ math_emulate(&info);
} else {
math_state_restore(); /* interrupts still off */
conditional_sti(regs);
@@ -949,9 +944,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
void __init trap_init(void)
{
-#ifdef CONFIG_X86_32
int i;
-#endif
#ifdef CONFIG_EISA
void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1008,11 +1001,15 @@ void __init trap_init(void)
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+#endif
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
set_bit(i, used_vectors);
+#ifdef CONFIG_X86_64
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
+#else
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 424093b157d..83d53ce5d4c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -15,6 +15,7 @@
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
+#include <asm/hypervisor.h>
unsigned int cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -31,6 +32,7 @@ static int tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int tsc_disabled = -1;
+static int tsc_clocksource_reliable;
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -98,6 +100,15 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int __init tsc_setup(char *str)
+{
+ if (!strcmp(str, "reliable"))
+ tsc_clocksource_reliable = 1;
+ return 1;
+}
+
+__setup("tsc=", tsc_setup);
+
#define MAX_RETRIES 5
#define SMI_TRESHOLD 50000
@@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate;
+ unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
int hpet = is_hpet_enabled(), i, loopmin;
+ tsc_khz = get_hypervisor_tsc_freq();
+ if (tsc_khz) {
+ printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
+ return tsc_khz;
+ }
+
local_irq_save(flags);
fast_calibrate = quick_pit_calibrate();
local_irq_restore(flags);
@@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
{}
};
-/*
- * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
- */
+static void __init check_system_tsc_reliable(void)
+{
#ifdef CONFIG_MGEODE_LX
-/* RTSC counts during suspend */
+ /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
-
-static void __init check_geode_tsc_reliable(void)
-{
unsigned long res_low, res_high;
rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
if (res_low & RTSC_SUSP)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-}
-#else
-static inline void check_geode_tsc_reliable(void) { }
+ tsc_clocksource_reliable = 1;
#endif
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ tsc_clocksource_reliable = 1;
+}
/*
* Make an educated guess if the TSC is trustworthy and synchronized
@@ -759,7 +773,7 @@ __cpuinit int unsynchronized_tsc(void)
if (!cpu_has_tsc || tsc_unstable)
return 1;
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
if (apic_is_clustered_box())
return 1;
#endif
@@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void)
{
clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
clocksource_tsc.shift);
+ if (tsc_clocksource_reliable)
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
@@ -843,7 +859,7 @@ void __init tsc_init(void)
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
- check_geode_tsc_reliable();
+ check_system_tsc_reliable();
init_tsc_clocksource();
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9ffb01c31c4..bf36328f6ef 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,9 @@ static __cpuinit void check_tsc_warp(void)
cycles_t start, now, prev, end;
int i;
+ rdtsc_barrier();
start = get_cycles();
+ rdtsc_barrier();
/*
* The measurement runs for 20 msecs:
*/
@@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(void)
*/
__raw_spin_lock(&sync_lock);
prev = last_tsc;
+ rdtsc_barrier();
now = get_cycles();
+ rdtsc_barrier();
last_tsc = now;
__raw_spin_unlock(&sync_lock);
@@ -108,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (unsynchronized_tsc())
return;
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ printk(KERN_INFO
+ "Skipping synchronization checks as TSC is reliable.\n");
+ return;
+ }
+
printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
smp_processor_id(), cpu);
@@ -161,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void)
{
int cpus = 2;
- if (unsynchronized_tsc())
+ if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
return;
/*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 0c9667f0752..4fd646e6dd4 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -32,9 +32,9 @@
#include <asm/e820.h>
#include <asm/io.h>
-#include <mach_ipi.h>
+#include <asm/genapic.h>
-#include "mach_apic.h"
+#include <asm/genapic.h>
#include <linux/kernel_stat.h>
@@ -176,33 +176,31 @@ static int __init visws_get_smp_config(unsigned int early)
* No problem for Linux.
*/
-static void __init MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_cpu *m)
{
int ver, logical_apicid;
physid_mask_t apic_cpus;
- if (!(m->mpc_cpuflag & CPU_ENABLED))
+ if (!(m->cpuflag & CPU_ENABLED))
return;
- logical_apicid = m->mpc_apicid;
+ logical_apicid = m->apicid;
printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
- m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
- m->mpc_apicid,
- (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
- m->mpc_apicver);
+ m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
+ m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
- boot_cpu_physical_apicid = m->mpc_apicid;
+ if (m->cpuflag & CPU_BOOTPROCESSOR)
+ boot_cpu_physical_apicid = m->apicid;
- ver = m->mpc_apicver;
- if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
+ ver = m->apicver;
+ if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->mpc_apicid, MAX_APICS);
+ m->apicid, MAX_APICS);
return;
}
- apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
+ apic_cpus = apic->apicid_to_cpu_present(m->apicid);
physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
/*
* Validate version
@@ -210,15 +208,15 @@ static void __init MP_processor_info(struct mpc_config_processor *m)
if (ver == 0x0) {
printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
"fixing up to 0x10. (tell your hw vendor)\n",
- m->mpc_apicid);
+ m->apicid);
ver = 0x10;
}
- apic_version[m->mpc_apicid] = ver;
+ apic_version[m->apicid] = ver;
}
static int __init visws_find_smp_config(unsigned int reserve)
{
- struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
+ struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
if (ncpus > CO_CPU_MAX) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720..d7ac84e7fc1 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
ret = KVM86->regs32;
ret->fs = current->thread.saved_fs;
- loadsegment(gs, current->thread.saved_gs);
+ set_user_gs(ret, current->thread.saved_gs);
return ret;
}
@@ -197,9 +197,9 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-asmlinkage int sys_vm86old(struct pt_regs regs)
+int sys_vm86old(struct pt_regs *regs)
{
- struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
+ struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
@@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
if (tmp)
goto out;
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = &regs;
+ info.regs32 = regs;
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
ret = 0; /* we never return here */
@@ -227,7 +227,7 @@ out:
}
-asmlinkage int sys_vm86(struct pt_regs regs)
+int sys_vm86(struct pt_regs *regs)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
struct vm86plus_struct __user *v86;
tsk = current;
- switch (regs.bx) {
+ switch (regs->bx) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
+ ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
goto out;
case VM86_PLUS_INSTALL_CHECK:
/*
@@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)
ret = -EPERM;
if (tsk->thread.saved_sp0)
goto out;
- v86 = (struct vm86plus_struct __user *)regs.cx;
+ v86 = (struct vm86plus_struct __user *)regs->cx;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
- info.regs32 = &regs;
+ info.regs32 = regs;
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
@@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs32->ax = 0;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
- savesegment(gs, tsk->thread.saved_gs);
+ tsk->thread.saved_gs = get_user_gs(info->regs32);
tss = &per_cpu(init_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8b6c393ab9f..f052c84ecbe 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -266,109 +266,6 @@ static void vmi_nop(void)
{
}
-#ifdef CONFIG_DEBUG_PAGE_TYPE
-
-#ifdef CONFIG_X86_PAE
-#define MAX_BOOT_PTS (2048+4+1)
-#else
-#define MAX_BOOT_PTS (1024+1)
-#endif
-
-/*
- * During boot, mem_map is not yet available in paging_init, so stash
- * all the boot page allocations here.
- */
-static struct {
- u32 pfn;
- int type;
-} boot_page_allocations[MAX_BOOT_PTS];
-static int num_boot_page_allocations;
-static int boot_allocations_applied;
-
-void vmi_apply_boot_page_allocations(void)
-{
- int i;
- BUG_ON(!mem_map);
- for (i = 0; i < num_boot_page_allocations; i++) {
- struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
- page->type = boot_page_allocations[i].type;
- page->type = boot_page_allocations[i].type &
- ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
- }
- boot_allocations_applied = 1;
-}
-
-static void record_page_type(u32 pfn, int type)
-{
- BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
- boot_page_allocations[num_boot_page_allocations].pfn = pfn;
- boot_page_allocations[num_boot_page_allocations].type = type;
- num_boot_page_allocations++;
-}
-
-static void check_zeroed_page(u32 pfn, int type, struct page *page)
-{
- u32 *ptr;
- int i;
- int limit = PAGE_SIZE / sizeof(int);
-
- if (page_address(page))
- ptr = (u32 *)page_address(page);
- else
- ptr = (u32 *)__va(pfn << PAGE_SHIFT);
- /*
- * When cloning the root in non-PAE mode, only the userspace
- * pdes need to be zeroed.
- */
- if (type & VMI_PAGE_CLONE)
- limit = KERNEL_PGD_BOUNDARY;
- for (i = 0; i < limit; i++)
- BUG_ON(ptr[i]);
-}
-
-/*
- * We stash the page type into struct page so we can verify the page
- * types are used properly.
- */
-static void vmi_set_page_type(u32 pfn, int type)
-{
- /* PAE can have multiple roots per page - don't track */
- if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
- return;
-
- if (boot_allocations_applied) {
- struct page *page = pfn_to_page(pfn);
- if (type != VMI_PAGE_NORMAL)
- BUG_ON(page->type);
- else
- BUG_ON(page->type == VMI_PAGE_NORMAL);
- page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
- if (type & VMI_PAGE_ZEROED)
- check_zeroed_page(pfn, type, page);
- } else {
- record_page_type(pfn, type);
- }
-}
-
-static void vmi_check_page_type(u32 pfn, int type)
-{
- /* PAE can have multiple roots per page - skip checks */
- if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
- return;
-
- type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
- if (boot_allocations_applied) {
- struct page *page = pfn_to_page(pfn);
- BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
- BUG_ON(type == VMI_PAGE_NORMAL && page->type);
- BUG_ON((type & page->type) == 0);
- }
-}
-#else
-#define vmi_set_page_type(p,t) do { } while (0)
-#define vmi_check_page_type(p,t) do { } while (0)
-#endif
-
#ifdef CONFIG_HIGHPTE
static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
{
@@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
- vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
@@ -406,27 +302,32 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
* It is called only for swapper_pg_dir, which already has
* data on it.
*/
- vmi_set_page_type(pfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
- vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
- vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
static void vmi_release_pte(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
- vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
static void vmi_release_pmd(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
- vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+/*
+ * We use the pgd_free hook for releasing the pgd page:
+ */
+static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
+
+ vmi_ops.release_page(pfn, VMI_PAGE_L2);
}
/*
@@ -450,26 +351,22 @@ static void vmi_release_pmd(unsigned long pfn)
static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
}
static void vmi_set_pte(pte_t *ptep, pte_t pte)
{
/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
}
static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
@@ -477,10 +374,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
#ifdef CONFIG_X86_PAE
const pte_t pte = { .pte = pmdval.pmd };
- vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
#else
const pte_t pte = { pmdval.pud.pgd.pgd };
- vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
#endif
vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
}
@@ -502,7 +397,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
}
@@ -510,21 +404,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
/* Um, eww */
const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
}
static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
const pte_t pte = { .pte = 0 };
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
static void vmi_pmd_clear(pmd_t *pmd)
{
const pte_t pte = { .pte = 0 };
- vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
}
#endif
@@ -789,10 +680,11 @@ static inline int __init activate_vmi(void)
para_fill(pv_mmu_ops.write_cr2, SetCR2);
para_fill(pv_mmu_ops.write_cr3, SetCR3);
para_fill(pv_cpu_ops.write_cr4, SetCR4);
- para_fill(pv_irq_ops.save_fl, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+ para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
+ para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
+ para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
+ para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
para_fill(pv_cpu_ops.wbinvd, WBINVD);
para_fill(pv_cpu_ops.read_tsc, RDTSC);
@@ -881,6 +773,7 @@ static inline int __init activate_vmi(void)
if (vmi_ops.release_page) {
pv_mmu_ops.release_pte = vmi_release_pte;
pv_mmu_ops.release_pmd = vmi_release_pmd;
+ pv_mmu_ops.pgd_free = vmi_pgd_free;
}
/* Set linear is needed in all cases */
@@ -960,8 +853,6 @@ static inline int __init activate_vmi(void)
void __init vmi_init(void)
{
- unsigned long flags;
-
if (!vmi_rom)
probe_vmi_rom();
else
@@ -973,13 +864,21 @@ void __init vmi_init(void)
reserve_top_address(-vmi_rom->virtual_top);
- local_irq_save(flags);
- activate_vmi();
-
#ifdef CONFIG_X86_IO_APIC
/* This is virtual hardware; timer routing is wired correctly */
no_timer_check = 1;
#endif
+}
+
+void __init vmi_activate(void)
+{
+ unsigned long flags;
+
+ if (!vmi_rom)
+ return;
+
+ local_irq_save(flags);
+ activate_vmi();
local_irq_restore(flags & X86_EFLAGS_IF);
}
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f863..a4791ef412d 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
/* Upper bound is clockevent's use of ulong for cycle deltas. */
evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of_cpu(cpu);
+ evt->cpumask = cpumask_of(cpu);
printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
evt->name, evt->mult, evt->shift);
@@ -256,7 +256,7 @@ void __devinit vmi_time_bsp_init(void)
*/
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
local_irq_disable();
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
/*
* XXX handle_percpu_irq only defined for SMP; we need to switch over
* to using it, since this is a local interrupt, which each CPU must
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc..3eba7f7bac0 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
+ IRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
_etext = .; /* End of text section */
@@ -177,14 +178,7 @@ SECTIONS
__initramfs_end = .;
}
#endif
- . = ALIGN(PAGE_SIZE);
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
- __per_cpu_start = .;
- *(.data.percpu.page_aligned)
- *(.data.percpu)
- *(.data.percpu.shared_aligned)
- __per_cpu_end = .;
- }
+ PERCPU(PAGE_SIZE)
. = ALIGN(PAGE_SIZE);
/* freed after init ends here */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405..087a7f2c639 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
#define LOAD_OFFSET __START_KERNEL_map
#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
#include <asm/page.h>
#undef i386 /* in case the preprocessor is a 32bit one */
@@ -13,12 +14,15 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
-_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
+ data.init2 PT_LOAD FLAGS(7); /* RWE */
note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
@@ -35,6 +39,7 @@ SECTIONS
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
+ IRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
_etext = .; /* End of text section */
@@ -207,14 +212,28 @@ SECTIONS
__initramfs_end = .;
#endif
+#ifdef CONFIG_SMP
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - __data_nosave - should
+ * start another section data.init2. Also, pda should be at the head of
+ * percpu area. Preallocate it and define the percpu offset symbol
+ * so that it can be accessed as a percpu variable.
+ */
+ . = ALIGN(PAGE_SIZE);
+ PERCPU_VADDR(0, :percpu)
+#else
PERCPU(PAGE_SIZE)
+#endif
. = ALIGN(PAGE_SIZE);
__init_end = .;
. = ALIGN(PAGE_SIZE);
__nosave_begin = .;
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ *(.data.nosave)
+ } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
. = ALIGN(PAGE_SIZE);
__nosave_end = .;
@@ -238,8 +257,21 @@ SECTIONS
DWARF_DEBUG
}
+ /*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
/*
* Build-time check on the image size:
*/
ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec..c609205df59 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
flags &= ~X86_EFLAGS_IF;
return flags;
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
static void vsmp_restore_fl(unsigned long flags)
{
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
flags |= X86_EFLAGS_AC;
native_restore_fl(flags);
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
static void vsmp_irq_disable(void)
{
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
static void vsmp_irq_enable(void)
{
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
cap, ctl);
if (cap & ctl & (1 << 4)) {
/* Setup irq ops and turn on vSMP IRQ fastpath handling */
- pv_irq_ops.irq_disable = vsmp_irq_disable;
- pv_irq_ops.irq_enable = vsmp_irq_enable;
- pv_irq_ops.save_fl = vsmp_save_fl;
- pv_irq_ops.restore_fl = vsmp_restore_fl;
+ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
+ pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
+ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
pv_init_ops.patch = vsmp_patch;
ctl &= ~(1 << 4);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86..44153afc906 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -128,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
gettimeofday(tv,NULL);
return;
}
+
+ /*
+ * Surround the RDTSC by barriers, to make sure it's not
+ * speculated to outside the seqlock critical section and
+ * does not cause time warps:
+ */
+ rdtsc_barrier();
now = vread();
+ rdtsc_barrier();
+
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa35..3909e3ba5ce 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index b13acb75e82..2b54fe002e9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
* Restore the extended state if present. Otherwise, restore the FP/SSE
* state.
*/
-int restore_user_xstate(void __user *buf)
+static int restore_user_xstate(void __user *buf)
{
struct _fpx_sw_bytes fx_sw_user;
u64 mask;
@@ -310,7 +310,7 @@ static void __init setup_xstate_init(void)
/*
* Enable and initialize the xsave feature.
*/
-void __init xsave_cntxt_init(void)
+void __ref xsave_cntxt_init(void)
{
unsigned int eax, ebx, ecx, edx;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ce3251ce550..b81125f0bde 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -20,6 +20,8 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ # for device assignment:
+ depends on PCI
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select ANON_INODES
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c02343594b4..d3ec292f00f 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
ifeq ($(CONFIG_KVM_TRACE),y)
common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
endif
-ifeq ($(CONFIG_DMAR),y)
-common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+ifeq ($(CONFIG_IOMMU_API),y)
+common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 8772dc94682..e665d1c623c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -548,8 +548,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_lock(&kvm->lock);
pit->irq_source_id = kvm_request_irq_source_id(kvm);
mutex_unlock(&kvm->lock);
- if (pit->irq_source_id < 0)
+ if (pit->irq_source_id < 0) {
+ kfree(pit);
return NULL;
+ }
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
@@ -601,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
static void __inject_pit_timer_intr(struct kvm *kvm)
{
+ struct kvm_vcpu *vcpu;
+ int i;
+
mutex_lock(&kvm->lock);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
mutex_unlock(&kvm->lock);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (vcpu)
+ kvm_apic_nmi_wd_deliver(vcpu);
+ }
}
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1..179dcb0103f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
* Port from Qemu.
*/
#include <linux/mm.h>
+#include <linux/bitops.h>
#include "irq.h"
#include <linux/kvm_host.h>
+static void pic_lock(struct kvm_pic *s)
+{
+ spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+{
+ struct kvm *kvm = s->kvm;
+ unsigned acks = s->pending_acks;
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+
+ s->pending_acks = 0;
+ s->wakeup_needed = false;
+
+ spin_unlock(&s->lock);
+
+ while (acks) {
+ kvm_notify_acked_irq(kvm, __ffs(acks));
+ acks &= acks - 1;
+ }
+
+ if (wakeup) {
+ vcpu = s->kvm->vcpus[0];
+ if (vcpu)
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
+ pic_lock(s);
pic_update_irq(s);
+ pic_unlock(s);
}
void kvm_pic_set_irq(void *opaque, int irq, int level)
{
struct kvm_pic *s = opaque;
+ pic_lock(s);
if (irq >= 0 && irq < PIC_NUM_PINS) {
pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
}
+ pic_unlock(s);
}
/*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
+ pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
+ pic_unlock(s);
kvm_notify_acked_irq(kvm, irq);
return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, irqbase;
+ int irq, irqbase, n;
struct kvm *kvm = s->pics_state->irq_request_opaque;
struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
- if (s->irr & (1 << irq) || s->isr & (1 << irq))
- kvm_notify_acked_irq(kvm, irq+irqbase);
+ if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
+ n = irq + irqbase;
+ s->pics_state->pending_acks |= 1 << n;
+ }
}
s->last_irr = 0;
s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return;
}
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
+ pic_unlock(s);
}
static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return;
}
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
+ pic_unlock(s);
}
/*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- kvm_vcpu_kick(vcpu);
+ s->wakeup_needed = true;
}
}
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
+ spin_lock_init(&s->lock);
+ s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf3..2bf32a03cee 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
#include <linux/mm_types.h>
#include <linux/hrtimer.h>
#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
#include "iodev.h"
#include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
};
struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
irq_request_func *irq_request;
void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c03..8e5ee99551f 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
#include <linux/kvm_host.h>
#include <asm/msr.h>
-#include "svm.h"
+#include <asm/svm.h>
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab4894..afac68c0815 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
}
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+ return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_NMI:
kvm_inject_nmi(vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
}
break;
+ case APIC_DM_EXTINT:
+ /*
+ * Should only be called by kvm_apic_local_deliver() with LVT0,
+ * before NMI watchdog was enabled. Already handled by
+ * kvm_apic_accept_pic_intr().
+ */
+ break;
+
default:
printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic->timer.period)));
}
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+
+ if (apic_lvt_nmi_mode(lvt0_val)) {
+ if (!nmi_wd_enabled) {
+ apic_debug("Receive NMI setting on APIC_LVT0 "
+ "for cpu %d\n", apic->vcpu->vcpu_id);
+ apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+ }
+ } else if (nmi_wd_enabled)
+ apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
+
static void apic_mmio_write(struct kvm_io_device *this,
gpa_t address, int len, const void *data)
{
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
+ case APIC_LVT0:
+ apic_manage_nmi_watchdog(apic, val);
case APIC_LVTT:
case APIC_LVTTHMR:
case APIC_LVTPC:
- case APIC_LVT0:
case APIC_LVT1:
case APIC_LVTERR:
/* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
return 0;
}
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+ u32 reg = apic_get_reg(apic, lvt_type);
+ int vector, mode, trig_mode;
+
+ if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ vector = reg & APIC_VECTOR_MASK;
+ mode = reg & APIC_MODE_MASK;
+ trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+ return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+ }
+ return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
{
- int vector;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- vector = apic_lvt_vector(apic, APIC_LVTT);
- return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+ if (apic)
+ kvm_apic_local_deliver(apic, APIC_LVT0);
}
static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
- atomic_read(&apic->timer.pending) > 0) {
- if (__inject_apic_timer_irq(apic))
+ if (apic && atomic_read(&apic->timer.pending) > 0) {
+ if (kvm_apic_local_deliver(apic, APIC_LVTT))
atomic_dec(&apic->timer.pending);
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a5e64881d9..83f11c7474a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
*
*/
-#include "vmx.h"
#include "mmu.h"
#include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+#include <asm/vmx.h>
/*
* When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mt_mask;
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
+ shadow_mt_mask = mt_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -314,7 +316,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 1);
+ rmap_desc_cache, 4);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
int *write_count;
- write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ gfn = unalias_gfn(kvm, gfn);
+ write_count = slot_largepage_idx(gfn,
+ gfn_to_memslot_unaliased(kvm, gfn));
*write_count += 1;
}
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
int *write_count;
- write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ gfn = unalias_gfn(kvm, gfn);
+ write_count = slot_largepage_idx(gfn,
+ gfn_to_memslot_unaliased(kvm, gfn));
*write_count -= 1;
WARN_ON(*write_count < 0);
}
static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
{
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ struct kvm_memory_slot *slot;
int *largepage_idx;
+ gfn = unalias_gfn(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot);
return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
return NULL;
}
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
unsigned long *rmapp;
u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
spte = rmap_next(kvm, rmapp, spte);
}
- if (write_protected)
- kvm_flush_remote_tlbs(kvm);
+ return write_protected;
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&sp->oos_link);
ASSERT(is_empty_shadow_page(sp->spt));
- sp->slot_bitmap = 0;
+ bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
+ sp->global = 1;
sp->parent_pte = parent_pte;
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
struct kvm_mmu_page *sp = page_header(__pa(spte));
index = spte - sp->spt;
- __set_bit(index, sp->unsync_child_bitmap);
- sp->unsync_children = 1;
+ if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
+ sp->unsync_children++;
+ WARN_ON(!sp->unsync_children);
}
static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- sp->unsync_children = 1;
kvm_mmu_update_parents_unsync(sp);
return 1;
}
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
#define for_each_unsync_children(bitmap, idx) \
for (idx = find_first_bit(bitmap, 512); \
idx < 512; \
idx = find_next_bit(bitmap, 512, idx+1))
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_unsync_walk *walker)
+int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
{
- int i, ret;
+ int i;
- if (!sp->unsync_children)
- return 0;
+ if (sp->unsync)
+ for (i=0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
u64 ent = sp->spt[i];
- if (is_shadow_present_pte(ent)) {
+ if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
struct kvm_mmu_page *child;
child = page_header(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
- ret = mmu_unsync_walk(child, walker);
- if (ret)
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ __clear_bit(i, sp->unsync_child_bitmap);
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
return ret;
- __clear_bit(i, sp->unsync_child_bitmap);
}
if (child->unsync) {
- ret = walker->entry(child, walker);
- __clear_bit(i, sp->unsync_child_bitmap);
- if (ret)
- return ret;
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
}
}
}
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
sp->unsync_children = 0;
- return 0;
+ return nr_unsync_leaf;
+}
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, 0);
+ return __mmu_unsync_walk(sp, pvec);
}
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ list_del(&sp->oos_link);
+ --kvm->stat.mmu_unsync_global;
+}
+
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
sp->unsync = 0;
+ if (sp->global)
+ kvm_unlink_unsync_global(kvm, sp);
--kvm->stat.mmu_unsync;
}
@@ -1037,41 +1092,101 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 1;
}
- rmap_write_protect(vcpu->kvm, sp->gfn);
+ if (rmap_write_protect(vcpu->kvm, sp->gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
kvm_mmu_flush_tlb(vcpu);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
return 0;
}
-struct sync_walker {
- struct kvm_vcpu *vcpu;
- struct kvm_unsync_walk walker;
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
+ unsigned int idx[PT64_ROOT_LEVEL-1];
};
-static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_next(&pvec, &parents, -1), \
+ sp = pvec.page[i].sp; \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
+ int i)
{
- struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
- walker);
- struct kvm_vcpu *vcpu = sync_walk->vcpu;
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+ parents->idx[0] = pvec->page[n].idx;
+ return n;
+ }
- kvm_sync_page(vcpu, sp);
- return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+ parents->parent[sp->role.level-2] = sp;
+ parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ }
+
+ return n;
}
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
- struct sync_walker walker = {
- .walker = { .entry = mmu_sync_fn, },
- .vcpu = vcpu,
- };
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+ level++;
+ } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+ struct mmu_page_path *parents,
+ struct kvm_mmu_pages *pvec)
+{
+ parents->parent[parent->role.level-1] = NULL;
+ pvec->nr = 0;
+}
- while (mmu_unsync_walk(sp, &walker.walker))
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ while (mmu_unsync_walk(parent, &pages)) {
+ int protected = 0;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+
+ if (protected)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_sync_page(vcpu, sp);
+ mmu_pages_clear_parents(&parents);
+ }
cond_resched_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ }
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
if (!metaphysical) {
- rmap_write_protect(vcpu->kvm, gfn);
+ if (rmap_write_protect(vcpu->kvm, gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
if (level == PT32E_ROOT_LEVEL) {
shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
+ if (!shadow_addr)
+ return 1;
--level;
}
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
}
-struct zap_walker {
- struct kvm_unsync_walk walker;
- struct kvm *kvm;
- int zapped;
-};
-
-static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent)
{
- struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
- walker);
- kvm_mmu_zap_page(zap_walk->kvm, sp);
- zap_walk->zapped = 1;
- return 0;
-}
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
-static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct zap_walker walker = {
- .walker = { .entry = mmu_zap_fn, },
- .kvm = kvm,
- .zapped = 0,
- };
-
- if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ if (parent->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
- mmu_unsync_walk(sp, &walker.walker);
- return walker.zapped;
+
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_zap_page(kvm, sp);
+ mmu_pages_clear_parents(&parents);
+ }
+ zapped += pages.nr;
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ }
+
+ return zapped;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
struct kvm_mmu_page *sp = page_header(__pa(pte));
- __set_bit(slot, &sp->slot_bitmap);
+ __set_bit(slot, sp->slot_bitmap);
}
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
return page;
}
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+ u64 start, u64 end)
+{
+ int i;
+ u64 base, mask;
+ u8 prev_match, curr_match;
+ int num_var_ranges = KVM_NR_VAR_MTRR;
+
+ if (!mtrr_state->enabled)
+ return 0xFF;
+
+ /* Make end an inclusive end, instead of an exclusive one */
+ end--;
+
+ /* Look in fixed ranges. Just return the type as per start */
+ if (mtrr_state->have_fixed && (start < 0x100000)) {
+ int idx;
+
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state->fixed_ranges[idx];
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state->fixed_ranges[idx];
+ } else if (start < 0x1000000) {
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state->fixed_ranges[idx];
+ }
+ }
+
+ /*
+ * Look in variable ranges
+ * Look for multiple ranges matching this address and pick type
+ * as per MTRR precedence
+ */
+ if (!(mtrr_state->enabled & 2))
+ return mtrr_state->def_type;
+
+ prev_match = 0xFF;
+ for (i = 0; i < num_var_ranges; ++i) {
+ unsigned short start_state, end_state;
+
+ if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+ continue;
+
+ base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+ (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+ (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+
+ start_state = ((start & mask) == (base & mask));
+ end_state = ((end & mask) == (base & mask));
+ if (start_state != end_state)
+ return 0xFE;
+
+ if ((start & mask) != (base & mask))
+ continue;
+
+ curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+ if (prev_match == 0xFF) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
+ curr_match == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((prev_match == MTRR_TYPE_WRBACK &&
+ curr_match == MTRR_TYPE_WRTHROUGH) ||
+ (prev_match == MTRR_TYPE_WRTHROUGH &&
+ curr_match == MTRR_TYPE_WRBACK)) {
+ prev_match = MTRR_TYPE_WRTHROUGH;
+ curr_match = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (prev_match != curr_match)
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ if (prev_match != 0xFF)
+ return prev_match;
+
+ return mtrr_state->def_type;
+}
+
+static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u8 mtrr;
+
+ mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+ (gfn << PAGE_SHIFT) + PAGE_SIZE);
+ if (mtrr == 0xfe || mtrr == 0xff)
+ mtrr = MTRR_TYPE_WRBACK;
+ return mtrr;
+}
+
static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (s->role.word != sp->role.word)
return 1;
}
- kvm_mmu_mark_parents_unsync(vcpu, sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
+
+ if (sp->global) {
+ list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+ ++vcpu->kvm->stat.mmu_unsync_global;
+ } else
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+
mmu_convert_notrap(sp);
return 0;
}
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
- gfn_t gfn, pfn_t pfn, bool speculative,
+ int global, gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
u64 spte;
int ret = 0;
+ u64 mt_mask = shadow_mt_mask;
+ struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+
+ if (!(vcpu->arch.cr4 & X86_CR4_PGE))
+ global = 0;
+ if (!global && sp->global) {
+ sp->global = 0;
+ if (sp->unsync) {
+ kvm_unlink_unsync_global(vcpu->kvm, sp);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
+ }
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= shadow_user_mask;
if (largepage)
spte |= PT_PAGE_SIZE_MASK;
+ if (mt_mask) {
+ mt_mask = get_memory_type(vcpu, gfn) <<
+ kvm_x86_ops->get_mt_mask_shift();
+ spte |= mt_mask;
+ }
spte |= (u64)pfn << PAGE_SHIFT;
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= PT_WRITABLE_MASK;
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
+ * is the responsibility of mmu_get_page / kvm_sync_page.
+ * Same reasoning can be applied to dirty page accounting.
+ */
+ if (!can_unsync && is_writeble_pte(*shadow_pte))
+ goto set_pte;
+
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+ int *ptwrite, int largepage, int global,
+ gfn_t gfn, pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative, true)) {
+ dirty, largepage, global, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
|| (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
0, walk->write, 1, &walk->pt_write,
- walk->largepage, gfn, walk->pfn, false);
+ walk->largepage, 0, gfn, walk->pfn, false);
++vcpu->stat.pf_fixed;
return 1;
}
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page *sp, *n;
+
+ list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+ kvm_sync_page(vcpu, sp);
+}
+
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_global(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ const u8 *new, int bytes,
+ bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
+ if (guest_initiated) {
+ if (gfn == vcpu->arch.last_pt_write_gfn
+ && !last_updated_pte_accessed(vcpu)) {
+ ++vcpu->arch.last_pt_write_count;
+ if (vcpu->arch.last_pt_write_count >= 3)
+ flooded = 1;
+ } else {
+ vcpu->arch.last_pt_write_gfn = gfn;
+ vcpu->arch.last_pt_write_count = 1;
+ vcpu->arch.last_pte_updated = NULL;
+ }
}
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- spin_lock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.invlpg(vcpu, gva);
- spin_unlock(&vcpu->kvm->mmu_lock);
kvm_mmu_flush_tlb(vcpu);
++vcpu->stat.invlpg;
}
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
int i;
u64 *pt;
- if (!test_bit(slot, &sp->slot_bitmap))
+ if (!test_bit(slot, sp->slot_bitmap))
continue;
pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (sp->role.metaphysical)
continue;
- slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 613ec9aa674..9fd78b6e17a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
int *ptwrite;
pfn_t pfn;
u64 *sptep;
+ gpa_t pte_gpa;
};
static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
if (ret)
goto walk;
pte |= PT_DIRTY_MASK;
- kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+ kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
walker->ptes[walker->level - 1] = pte;
}
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+ gpte & PT_DIRTY_MASK, NULL, largepage,
+ gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
pfn, true);
}
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
sw->user_fault, sw->write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
- false);
+ sw->ptwrite, sw->largepage,
+ gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+ gw->gfn, sw->pfn, false);
sw->sptep = sptep;
return 1;
}
@@ -331,6 +334,7 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
if (r || curr_pte != gw->ptes[level - 2]) {
+ kvm_mmu_put_page(shadow_page, sptep);
kvm_release_pfn_clean(sw->pfn);
sw->sptep = NULL;
return 1;
@@ -465,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
struct kvm_vcpu *vcpu, u64 addr,
u64 *sptep, int level)
{
+ struct shadow_walker *sw =
+ container_of(_sw, struct shadow_walker, walker);
- if (level == PT_PAGE_TABLE_LEVEL) {
- if (is_shadow_present_pte(*sptep))
+ /* FIXME: properly handle invlpg on large guest pages */
+ if (level == PT_PAGE_TABLE_LEVEL ||
+ ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+ sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+ if (is_shadow_present_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
+ if (is_large_pte(*sptep))
+ --vcpu->kvm->stat.lpages;
+ }
set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
return 1;
}
@@ -479,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
+ pt_element_t gpte;
struct shadow_walker walker = {
.walker = { .entry = FNAME(shadow_invlpg_entry), },
+ .pte_gpa = -1,
};
+ spin_lock(&vcpu->kvm->mmu_lock);
walk_shadow(&walker.walker, vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ if (walker.pte_gpa == -1)
+ return;
+ if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return;
+ if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+ if (mmu_topup_memory_caches(vcpu))
+ return;
+ kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+ sizeof(pt_element_t), 0);
+ }
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -579,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gfn,
+ is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d96..1452851ae25 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
#include <asm/desc.h>
+#include <asm/virtext.h>
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
static int has_svm(void)
{
- uint32_t eax, ebx, ecx, edx;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- printk(KERN_INFO "has_svm: not amd\n");
- return 0;
- }
+ const char *msg;
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax < SVM_CPUID_FUNC) {
- printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
+ if (!cpu_has_svm(&msg)) {
+ printk(KERN_INFO "has_svn: %s\n", msg);
return 0;
}
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
- printk(KERN_DEBUG "has_svm: svm not available\n");
- return 0;
- }
return 1;
}
static void svm_hardware_disable(void *garbage)
{
- uint64_t efer;
-
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+ cpu_svm_disable();
}
static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+ /*
+ * SVM always stores 0 for the 'G' bit in the CS selector in
+ * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+ * Intel's VMENTRY has a check on the 'G' bit.
+ */
+ if (seg == VCPU_SREG_CS)
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ if (seg == VCPU_SREG_TR)
+ var->type |= 0x2;
+
var->unusable = !var->present;
}
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
rep = (io_info & SVM_IOIO_REP_MASK) != 0;
down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
+ skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
}
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
#endif
}
+static int svm_get_mt_mask_shift(void)
+{
+ return 0;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
+ .get_mt_mask_shift = svm_get_mt_mask_shift,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2643b430d83..6259d746764 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
*/
#include "irq.h"
-#include "vmx.h"
#include "mmu.h"
#include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
#include <asm/io.h>
#include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -90,6 +91,11 @@ struct vcpu_vmx {
} rmode;
int vpid;
bool emulation_required;
+
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
u32 vmentry_ctrl;
} vmcs_config;
-struct vmx_capability {
+static struct vmx_capability {
u32 ept;
u32 vpid;
} vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, data);
+ vcpu->arch.pat = data;
+ break;
+ }
+ /* Otherwise falls through to kvm_set_msr_common */
default:
vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
static __init int cpu_has_kvm_support(void)
{
- unsigned long ecx = cpuid_ecx(1);
- return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+ return cpu_has_vmx();
}
static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
__vcpu_clear(vmx);
}
-static void hardware_disable(void *garbage)
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
{
- vmclear_local_vcpus();
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
+static void hardware_disable(void *garbage)
+{
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
+}
+
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
- opt = 0;
+ opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
- min = opt = 0;
+ min = 0;
+ opt = VM_ENTRY_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control) < 0)
return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
*/
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
- u32 host_sysenter_cs;
+ u32 host_sysenter_cs, msr_low, msr_high;
u32 junk;
+ u64 host_pat;
unsigned long a;
struct descriptor_table dt;
int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
rdmsrl(MSR_IA32_SYSENTER_EIP, a);
vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((u64) msr_high << 32);
+ vmcs_write64(HOST_IA32_PAT, host_pat);
+ }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((u64) msr_high << 32);
+ /* Write the default value, following the host PAT */
+ vmcs_write64(GUEST_IA32_PAT, host_pat);
+ /* Keep arch.pat in sync with GUEST_IA32_PAT */
+ vmx->vcpu.arch.pat = host_pat;
+ }
+
for (i = 0; i < NR_VMX_MSR; ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.rmode.active = 0;
+ vmx->soft_vnmi_blocked = 0;
+
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
return ret;
}
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ if (!cpu_has_virtual_nmis()) {
+ enable_irq_window(vcpu);
+ return;
+ }
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ if (vcpu->arch.rmode.active) {
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = NMI_VECTOR;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+ INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ return;
+ }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
+static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+{
+ u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+ vcpu->arch.nmi_window_open =
+ !(guest_intr & (GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS |
+ GUEST_INTR_STATE_NMI));
+ if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+ vcpu->arch.nmi_window_open = 0;
+
+ vcpu->arch.interrupt_window_open =
+ ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(guest_intr & (GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS)));
+}
+
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
{
int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
kvm_queue_interrupt(vcpu, irq);
}
-
static void do_interrupt_requests(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- u32 cpu_based_vm_exec_control;
-
- vcpu->arch.interrupt_window_open =
- ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+ vmx_update_window_states(vcpu);
- if (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
- kvm_do_inject_irq(vcpu);
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.interrupt.pending) {
+ enable_nmi_window(vcpu);
+ } else if (vcpu->arch.nmi_window_open) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_nmi_window(vcpu);
+ return;
+ }
+ }
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending)
+ enable_nmi_window(vcpu);
+ else if (vcpu->arch.irq_summary
+ || kvm_run->request_interrupt_window)
+ enable_irq_window(vcpu);
+ return;
+ }
- if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
- vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+ if (vcpu->arch.interrupt_window_open) {
+ if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+ kvm_do_inject_irq(vcpu);
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ if (vcpu->arch.interrupt.pending)
+ vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+ }
if (!vcpu->arch.interrupt_window_open &&
(vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
- /*
- * Interrupts blocked. Wait for unblock.
- */
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- else
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ enable_irq_window(vcpu);
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
int ret;
struct kvm_userspace_memory_region tss_mem = {
- .slot = 8,
+ .slot = TSS_PRIVATE_MEMSLOT,
.guest_phys_addr = addr,
.memory_size = PAGE_SIZE * 3,
.flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
rep = (exit_qualification & 32) != 0;
port = exit_qualification >> 16;
+ skip_emulated_instruction(vcpu);
return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
}
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
KVMTRACE_0D(PEND_INTR, vcpu, handler);
+ ++vcpu->stat.irq_window_exits;
/*
* If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
if (kvm_run->request_interrupt_window &&
!vcpu->arch.irq_summary) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- ++vcpu->stat.irq_window_exits;
return 0;
}
return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
u16 tss_selector;
int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
+ (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
+ == INTR_TYPE_NMI_INTR) {
+ vcpu->arch.nmi_injected = false;
+ if (cpu_has_virtual_nmis())
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
tss_selector = exit_qualification;
return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
while (!guest_state_valid(vcpu)) {
err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
- switch (err) {
- case EMULATE_DONE:
- break;
- case EMULATE_DO_MMIO:
- kvm_report_emulation_failure(vcpu, "mmio");
- /* TODO: Handle MMIO */
- return;
- default:
- kvm_report_emulation_failure(vcpu, "emulation failure");
- return;
+ if (err == EMULATE_DO_MMIO)
+ break;
+
+ if (err != EMULATE_DONE) {
+ kvm_report_emulation_failure(vcpu, "emulation failure");
+ return;
}
if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
local_irq_disable();
preempt_disable();
- /* Guest state should be valid now, no more emulation should be needed */
- vmx->emulation_required = 0;
+ /* Guest state should be valid now except if we need to
+ * emulate an MMIO */
+ if (guest_state_valid(vcpu))
+ vmx->emulation_required = 0;
}
/*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
(u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
+ /* If we need to emulate an MMIO from handle_invalid_guest_state
+ * we just return 0 */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return 0;
+
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION))
- printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
- "exit reason is 0x%x\n", __func__, exit_reason);
+ exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_TASK_SWITCH))
+ printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+ "(0x%x) and exit reason is 0x%x\n",
+ __func__, vectoring_info, exit_reason);
+
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (vcpu->arch.interrupt_window_open) {
+ vmx->soft_vnmi_blocked = 0;
+ vcpu->arch.nmi_window_open = 1;
+ } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->soft_vnmi_blocked = 0;
+ vmx->vcpu.arch.nmi_window_open = 1;
+ }
+ }
+
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- if (!cpu_has_virtual_nmis())
- return;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
-{
- u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- return !(guest_intr & (GUEST_INTR_STATE_NMI |
- GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_STI));
-}
-
-static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
-{
- u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_STI)) &&
- (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
-}
-
-static void enable_intr_window(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
-}
-
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
if (unblock_nmi && vector != DF_VECTOR)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
- }
+ } else if (unlikely(vmx->soft_vnmi_blocked))
+ vmx->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
idt_vectoring_info = vmx->idt_vectoring_info;
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,24 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
{
update_tpr_threshold(vcpu);
- if (cpu_has_virtual_nmis()) {
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vmx_nmi_enabled(vcpu)) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_intr_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- enable_intr_window(vcpu);
+ vmx_update_window_states(vcpu);
+
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.interrupt.pending) {
+ enable_nmi_window(vcpu);
+ } else if (vcpu->arch.nmi_window_open) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_nmi_window(vcpu);
return;
}
}
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending)
+ enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu))
+ enable_irq_window(vcpu);
+ return;
+ }
if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
- if (vmx_irq_enabled(vcpu))
+ if (vcpu->arch.interrupt_window_open)
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
else
enable_irq_window(vcpu);
@@ -3172,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
if (vcpu->arch.interrupt.pending) {
vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+ if (kvm_cpu_has_interrupt(vcpu))
+ enable_irq_window(vcpu);
}
}
@@ -3211,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info;
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+ vmx->entry_time = ktime_get();
+
/* Handle invalid guest state instead of entering VMX */
if (vmx->emulation_required && emulate_invalid_guest_state) {
handle_invalid_guest_state(vcpu, kvm_run);
@@ -3325,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
- vcpu->arch.interrupt_window_open =
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
+ vmx_update_window_states(vcpu);
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
@@ -3335,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
/* We need to handle NMIs before interrupts are enabled */
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
(intr_info & INTR_INFO_VALID_MASK)) {
KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2");
@@ -3453,6 +3571,11 @@ static int get_ept_level(void)
return VMX_EPT_DEFAULT_GAW + 1;
}
+/*
+ * Return VMX_EPT_MT_EPTE_SHIFT.  Installed as the .get_mt_mask_shift
+ * callback of vmx_x86_ops.
+ */
+static int vmx_get_mt_mask_shift(void)
+{
+ return VMX_EPT_MT_EPTE_SHIFT;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -3508,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
+ .get_mt_mask_shift = vmx_get_mt_mask_shift,
};
static int __init vmx_init(void)
@@ -3564,9 +3688,10 @@ static int __init vmx_init(void)
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK |
- VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+ VMX_EPT_IGMT_BIT);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
+ VMX_EPT_EXECUTABLE_MASK,
+ VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
kvm_enable_tdp();
} else
kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa..cc17546a240 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -34,11 +34,13 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
+#include <asm/mtrr.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -86,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
+ { "request_nmi", VCPU_STAT(request_nmi_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +96,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
+ { "nmi_injections", VCPU_STAT(nmi_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +105,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
+ { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
@@ -312,6 +317,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
+ kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
return;
}
@@ -355,6 +361,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
+ kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +456,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_IA32_PERF_STATUS,
+ MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
};
static unsigned num_msrs_to_save;
@@ -648,10 +655,38 @@ static bool msr_mtrr_valid(unsigned msr)
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
+ u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
if (!msr_mtrr_valid(msr))
return 1;
- vcpu->arch.mtrr[msr - 0x200] = data;
+ if (msr == MSR_MTRRdefType) {
+ vcpu->arch.mtrr_state.def_type = data;
+ vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+ } else if (msr == MSR_MTRRfix64K_00000)
+ p[0] = data;
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ p[1 + msr - MSR_MTRRfix16K_80000] = data;
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ u64 *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pt = data;
+ }
+
+ kvm_mmu_reset_context(vcpu);
return 0;
}
@@ -747,10 +782,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
+ u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
if (!msr_mtrr_valid(msr))
return 1;
- *pdata = vcpu->arch.mtrr[msr - 0x200];
+ if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.def_type +
+ (vcpu->arch.mtrr_state.enabled << 10);
+ else if (msr == MSR_MTRRfix64K_00000)
+ *pdata = p[0];
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ u64 *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pdata = *pt;
+ }
+
return 0;
}
@@ -903,7 +965,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IRQCHIP:
case KVM_CAP_HLT:
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
- case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
case KVM_CAP_CLOCKSOURCE:
@@ -929,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = !tdp_enabled;
break;
case KVM_CAP_IOMMU:
- r = intel_iommu_found();
+ r = iommu_found();
break;
default:
r = 0;
@@ -1188,6 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
int t, times = entry->eax & 0xff;
entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
for (t = 1; t < times && *nent < maxnent; ++t) {
do_cpuid_1_ent(&entry[t], function, 0);
entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1280,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until level_type is zero */
for (i = 1; *nent < maxnent; ++i) {
- level_type = entry[i - 1].ecx & 0xff;
+ level_type = entry[i - 1].ecx & 0xff00;
if (!level_type)
break;
do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1380,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return 0;
}
+/*
+ * Handler for the KVM_NMI vcpu ioctl: calls kvm_inject_nmi() on the vcpu
+ * between vcpu_load() and vcpu_put().  Always returns 0.
+ */
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_inject_nmi(vcpu);
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -1377,6 +1448,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_NMI: {
+ r = kvm_vcpu_ioctl_nmi(vcpu);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
@@ -1968,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
if (ret < 0)
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
return 1;
}
@@ -2404,8 +2482,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
if (pio_dev) {
kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2617,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
+ PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
return 0;
out:
@@ -2729,7 +2805,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; j == i; j = (j + 1) % nent) {
+ for (j = i + 1; ; j = (j + 1) % nent) {
struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
if (ej->function == e->function) {
ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3049,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
- r = kvm_x86_ops->vcpu_reset(vcpu);
+ r = kvm_arch_vcpu_reset(vcpu);
if (r)
return r;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3351,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
kvm_desct->padding = 0;
}
-static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
- u16 selector,
- struct descriptor_table *dtable)
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+ u16 selector,
+ struct descriptor_table *dtable)
{
if (selector & 1 << 2) {
struct kvm_segment kvm_seg;
@@ -3302,7 +3378,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct descriptor_table dtable;
u16 index = selector >> 3;
- get_segment_descritptor_dtable(vcpu, selector, &dtable);
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7) {
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3397,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct descriptor_table dtable;
u16 index = selector >> 3;
- get_segment_descritptor_dtable(vcpu, selector, &dtable);
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7)
return 1;
@@ -3900,6 +3976,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
/* We do fxsave: this must be aligned. */
BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+ vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
if (r == 0)
@@ -3925,6 +4002,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = false;
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -4012,6 +4092,7 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4129,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- kvm_iommu_unmap_guest(kvm);
kvm_free_all_assigned_devices(kvm);
+ kvm_iommu_unmap_guest(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
@@ -4127,7 +4208,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+ || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+ || vcpu->arch.nmi_pending;
}
static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0d..d174db7a337 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
#define SrcImm (5<<4) /* Immediate operand. */
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
+#define SrcOne (7<<4) /* Implied '1' */
#define SrcMask (7<<4)
/* Generic ModRM decode. */
#define ModRM (1<<7)
@@ -70,17 +71,23 @@
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
+/*
+ * Source 2 operand type: a 3-bit field in bits 29-31 of the decode flags.
+ * The constants are unsigned because 7<<29 does not fit in a signed int
+ * (shifting a one into the sign bit is undefined behaviour).
+ */
+#define Src2None (0u<<29)
+#define Src2CL (1u<<29)
+#define Src2ImmByte (2u<<29)
+#define Src2One (3u<<29)
+#define Src2Mask (7u<<29)
enum {
Group1_80, Group1_81, Group1_82, Group1_83,
Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
};
-static u16 opcode_table[256] = {
+static u32 opcode_table[256] = {
/* 0x00 - 0x07 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x08 - 0x0F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
};
-static u16 twobyte_table[256] = {
+static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM, 0, 0,
/* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM,
+ ModRM, 0,
/* 0xB0 - 0xB7 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-static u16 group_table[] = {
+static u32 group_table[] = {
[Group1_80*8] =
ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
};
-static u16 group2_table[] = {
+static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM, 0, 0, 0,
+ SrcNone | ModRM, 0, 0, SrcNone | ModRM,
SrcNone | ModRM | DstMem | Mov, 0,
SrcMem16 | ModRM | Mov, 0,
};
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
"andl %"_msk",%"_LO32 _tmp"; " \
"orl %"_LO32 _tmp",%"_sav"; "
+/*
+ * ON64(x) expands to x when CONFIG_X86_64 is defined and to nothing
+ * otherwise, so 64-bit-only statements can be written inline.
+ */
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+/*
+ * Raw emulation of a two-operand instruction at a single width.
+ * _suffix is pasted after _op to select the operand size, _x is the
+ * operand modifier placed in front of the source operand number and _y
+ * is the constraint string for the source.
+ *
+ * The macro does not declare _tmp: the expanding scope must provide an
+ * 'unsigned long _tmp' scratch variable.
+ *
+ * The destination is a read-write ("+m") operand because two-operand
+ * instructions read it as well as (possibly) write it.
+ */
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
+ do { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op _suffix " %"_x"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "+m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : _y ((_src).val), "i" (EFLAGS_MASK)); \
+ } while (0)
+
+
/* Raw emulation: instruction has two explicit operands. */
#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"w %"_wx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _wy ((_src).val), "i" (EFLAGS_MASK)); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"l %"_lx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _ly ((_src).val), "i" (EFLAGS_MASK)); \
- break; \
- case 8: \
- __emulate_2op_8byte(_op, _src, _dst, \
- _eflags, _qx, _qy); \
- break; \
- } \
+ do { \
+ unsigned long _tmp; \
+ \
+ switch ((_dst).bytes) { \
+ case 2: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+ break; \
+ case 4: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+ break; \
+ case 8: \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+ break; \
+ } \
} while (0)
#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
- unsigned long __tmp; \
+ unsigned long _tmp; \
switch ((_dst).bytes) { \
case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"b %"_bx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (__tmp) \
- : _by ((_src).val), "i" (EFLAGS_MASK)); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
break; \
default: \
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
"w", "r", _LO32, "r", "", "r")
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"b %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- break; \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"w %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"l %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- break; \
- case 8: \
- __emulate_1op_8byte(_op, _dst, _eflags); \
- break; \
- } \
+/*
+ * Instruction has three operands and one operand is stored in ECX register.
+ *
+ * The three operand values are first copied into locals of type _type.
+ * The count (_cl) is bound with the "c" register constraint, the source
+ * with "r", and the destination is a read-write register operand; only
+ * the source (%4) and destination (%1) appear in the instruction text.
+ * Afterwards all three locals are stored back into the operands' .val
+ * fields, cast to unsigned long.
+ */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+ do { \
+ unsigned long _tmp; \
+ _type _clv = (_cl).val; \
+ _type _srcv = (_src).val; \
+ _type _dstv = (_dst).val; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "2") \
+ _op _suffix " %4,%1 \n" \
+ _POST_EFLAGS("0", "5", "2") \
+ : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
+ ); \
+ \
+ (_cl).val = (unsigned long) _clv; \
+ (_src).val = (unsigned long) _srcv; \
+ (_dst).val = (unsigned long) _dstv; \
+ } while (0)
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"q %"_qx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _qy ((_src).val), "i" (EFLAGS_MASK)); \
+/*
+ * Width dispatcher for __emulate_2op_cl: picks the "w", "l" or "q" form
+ * from (_dst).bytes.  The 8-byte form is wrapped in ON64(); any other
+ * size (including 1) matches no case and nothing is executed.
+ */
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "w", unsigned short); \
+ break; \
+ case 4: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "l", unsigned int); \
+ break; \
+ case 8: \
+ ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "q", unsigned long)); \
+ break; \
+ } \
+ } while (0)
-#define __emulate_1op_8byte(_op, _dst, _eflags) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"q %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
+/*
+ * Raw emulation of a one-operand instruction at a single width.
+ * _suffix is pasted after _op to select the operand size.  (_dst).val
+ * is a read-write memory operand (%1); _eflags is operand 0 and is
+ * handed to the _PRE_EFLAGS/_POST_EFLAGS wrappers together with the
+ * local scratch register _tmp.
+ */
+#define __emulate_1op(_op, _dst, _eflags, _suffix) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op _suffix " %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "+m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
+ } while (0)
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif /* __i386__ */
+/*
+ * Instruction has only one explicit operand (no source operand).
+ * Dispatches on (_dst).bytes to the "b", "w", "l" or "q" form of
+ * __emulate_1op; the 8-byte form is wrapped in ON64().  A size that
+ * matches no case executes nothing.
+ */
+#define emulate_1op(_op, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
+ case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
+ case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+ } \
+ } while (0)
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _size, _eip) \
@@ -1041,6 +1049,33 @@ done_prefixes:
c->src.bytes = 1;
c->src.val = insn_fetch(s8, 1, c->eip);
break;
+ case SrcOne:
+ c->src.bytes = 1;
+ c->src.val = 1;
+ break;
+ }
+
+ /*
+ * Decode and fetch the second source operand: the CL register,
+ * an immediate byte, or an implied constant 1.
+ */
+ switch (c->d & Src2Mask) {
+ case Src2None:
+ break;
+ case Src2CL:
+ c->src2.bytes = 1;
+ /*
+ * CL is the whole low byte of RCX.  Keep all eight bits; the
+ * shift instruction applies its own count masking when it is
+ * executed.  Masking with 0x8 would keep only bit 3.
+ */
+ c->src2.val = c->regs[VCPU_REGS_RCX] & 0xff;
+ break;
+ case Src2ImmByte:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 1;
+ c->src2.val = insn_fetch(u8, 1, c->eip);
+ break;
+ case Src2One:
+ c->src2.bytes = 1;
+ c->src2.val = 1;
+ break;
}
/* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->regs[VCPU_REGS_RSP]);
}
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_std(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- &c->dst.val, c->dst.bytes, ctxt->vcpu);
+ rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+ c->regs[VCPU_REGS_RSP]),
+ &c->src.val, c->src.bytes, ctxt->vcpu);
if (rc != 0)
return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+ return rc;
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ c->src.bytes = c->dst.bytes;
+ rc = emulate_pop(ctxt, ops);
+ if (rc != 0)
+ return rc;
+ c->dst.val = c->src.val;
return 0;
}
@@ -1415,24 +1463,15 @@ special_insn:
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 0x50 ... 0x57: /* push reg */
- c->dst.type = OP_MEM;
- c->dst.bytes = c->op_bytes;
- c->dst.val = c->src.val;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP],
- -c->op_bytes);
- c->dst.ptr = (void *) register_address(
- c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
+ emulate_push(ctxt);
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
- if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]), c->dst.ptr,
- c->op_bytes, ctxt->vcpu)) != 0)
+ c->src.bytes = c->op_bytes;
+ rc = emulate_pop(ctxt, ops);
+ if (rc != 0)
goto done;
-
- register_address_increment(c, &c->regs[VCPU_REGS_RSP],
- c->op_bytes);
- c->dst.type = OP_NONE; /* Disable writeback. */
+ c->dst.val = c->src.val;
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
emulate_push(ctxt);
break;
case 0x9d: /* popf */
+ c->dst.type = OP_REG;
c->dst.ptr = (unsigned long *) &ctxt->eflags;
+ c->dst.bytes = c->op_bytes;
goto pop_instruction;
case 0xa0 ... 0xa1: /* mov */
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
emulate_grp2(ctxt);
break;
case 0xc3: /* ret */
+ c->dst.type = OP_REG;
c->dst.ptr = &c->eip;
+ c->dst.bytes = c->op_bytes;
goto pop_instruction;
case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
mov:
@@ -1778,7 +1821,7 @@ special_insn:
c->eip = saved_eip;
goto cannot_emulate;
}
- return 0;
+ break;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
break;
+ case 0xa4: /* shld imm8, r, r/m */
+ case 0xa5: /* shld cl, r, r/m */
+ emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
case 0xab:
bts: /* bts */
/* only subword offset */
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
break;
+ case 0xac: /* shrd imm8, r, r/m */
+ case 0xad: /* shrd cl, r, r/m */
+ emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
case 0xae: /* clflush */
break;
case 0xb0 ... 0xb1: /* cmpxchg */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a5d8e1ace1c..da2e314f61b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
{
return lguest_data.irq_enabled;
}
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
/* restore_flags() just sets the flags back to the value given. */
static void restore_fl(unsigned long flags)
{
lguest_data.irq_enabled = flags;
}
+PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
/* Interrupts go off... */
static void irq_disable(void)
{
lguest_data.irq_enabled = 0;
}
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
/* Interrupts go on... */
static void irq_enable(void)
{
lguest_data.irq_enabled = X86_EFLAGS_IF;
}
+PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+
/*:*/
/*M:003 Note that we don't check for outstanding interrupts when we re-enable
* them (or when we unmask an interrupt). This seems to work for the moment,
@@ -278,7 +283,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
/* There's one problem which normal hardware doesn't have: the Host
* can't handle us removing entries we're currently using. So we clear
* the GS register here: if it's needed it'll be reloaded anyway. */
- loadsegment(gs, 0);
+ lazy_load_gs(0);
lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
}
@@ -590,7 +595,8 @@ static void __init lguest_init_IRQ(void)
* a straightforward 1 to 1 mapping, so force that here. */
__get_cpu_var(vector_irq)[vector] = i;
if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector, interrupt[vector]);
+ set_intr_gate(vector,
+ interrupt[vector-FIRST_EXTERNAL_VECTOR]);
set_irq_chip_and_handler_name(i, &lguest_irq_controller,
handle_level_irq,
"level");
@@ -737,7 +743,7 @@ static void lguest_time_init(void)
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of_cpu(0);
+ lguest_clockevent.cpumask = cpumask_of(0);
clockevents_register_device(&lguest_clockevent);
/* Finally, we unblock the timer interrupt. */
@@ -930,7 +936,7 @@ static void lguest_restart(char *reason)
* that we can fit comfortably.
*
* First we need assembly templates of each of the patchable Guest operations,
- * and these are in lguest_asm.S. */
+ * and these are in i386_head.S. */
/*G:060 We construct a table from the assembler templates: */
static const struct lguest_insns
@@ -983,10 +989,10 @@ __init void lguest_init(void)
/* interrupt-related operations */
pv_irq_ops.init_IRQ = lguest_init_IRQ;
- pv_irq_ops.save_fl = save_fl;
- pv_irq_ops.restore_fl = restore_fl;
- pv_irq_ops.irq_disable = irq_disable;
- pv_irq_ops.irq_enable = irq_enable;
+ pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
pv_irq_ops.safe_halt = lguest_safe_halt;
/* init-time operations */
@@ -1092,7 +1098,7 @@ __init void lguest_init(void)
acpi_ht = 0;
#endif
- /* We set the perferred console to "hvc". This is the "hypervisor
+ /* We set the preferred console to "hvc". This is the "hypervisor
* virtual console" driver written by the PowerPC people, which we also
* adapted for lguest's use. */
add_preferred_console("hvc", 0, NULL);
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e..10b9bd35a8f 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
movl $lguest_data - __PAGE_OFFSET, %edx
int $LGUEST_TRAP_ENTRY
- /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
- * instruction uses %esi implicitly as the source for the copy we're
- * about to do. */
- movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
-
- /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
- * This means the first 128M of kernel memory will be mapped at
- * PAGE_OFFSET where the kernel expects to run. This will get it far
- * enough through boot to switch to its own pagetables. */
- movl $32, %ecx
- movl %esi, %edi
- addl $((__PAGE_OFFSET >> 22) * 4), %edi
- rep
- movsl
-
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f..7c8ca91bb9e 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
#define __do_strncpy_from_user(dst, src, count, res) \
do { \
int __d0, __d1, __d2; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
" testl %1,%1\n" \
" jz 2f\n" \
@@ -56,7 +56,7 @@ do { \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(0b,3b) \
- : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
+ : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \
"=&D" (__d2) \
: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
: "memory"); \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
#define __do_clear_user(addr,size) \
do { \
int __d0; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
"0: rep; stosl\n" \
" movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
unsigned long
clear_user(void __user *to, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
__do_clear_user(to, n);
return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
unsigned long mask = -__addr_ok(s);
unsigned long res, tmp;
- might_sleep();
+ might_fault();
__asm__ __volatile__(
" testl %0, %0\n"
@@ -218,7 +218,7 @@ long strnlen_user(const char __user *s, long n)
" .align 4\n"
" .long 0b,2b\n"
".previous"
- :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp)
+ :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp)
:"0" (n), "1" (s), "2" (0), "3" (mask)
:"cc");
return res & mask;
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718..ec13cb5f17e 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
#define __do_strncpy_from_user(dst,src,count,res) \
do { \
long __d0, __d1, __d2; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
" testq %1,%1\n" \
" jz 2f\n" \
@@ -32,7 +32,7 @@ do { \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(0b,3b) \
- : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
+ : "=&r"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \
"=&D" (__d2) \
: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
: "memory"); \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
unsigned long __clear_user(void __user *addr, unsigned long size)
{
long __d0;
- might_sleep();
+ might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
asm volatile(
@@ -86,7 +86,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
".previous\n"
_ASM_EXTABLE(0b,3b)
_ASM_EXTABLE(1b,2b)
- : [size8] "=c"(size), [dst] "=&D" (__d0)
+ : [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
[zero] "r" (0UL), [eight] "r" (8UL));
return size;
diff --git a/arch/x86/mach-default/Makefile b/arch/x86/mach-default/Makefile
deleted file mode 100644
index 012fe34459e..00000000000
--- a/arch/x86/mach-default/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-y := setup.o
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
deleted file mode 100644
index 37b9ae4d44c..00000000000
--- a/arch/x86/mach-default/setup.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Machine specific setup for generic
- */
-
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/arch_hooks.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-#include <mach_ipi.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-int no_broadcast = DEFAULT_SEND_IPI;
-
-/**
- * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
- *
- * Description:
- * Perform any necessary interrupt initialisation prior to setting up
- * the "ordinary" interrupt call gates. For legacy reasons, the ISA
- * interrupts should be initialised here if the machine emulates a PC
- * in any way.
- **/
-void __init pre_intr_init_hook(void)
-{
- if (x86_quirks->arch_pre_intr_init) {
- if (x86_quirks->arch_pre_intr_init())
- return;
- }
- init_ISA_irqs();
-}
-
-/**
- * intr_init_hook - post gate setup interrupt initialisation
- *
- * Description:
- * Fill in any interrupts that may have been left out by the general
- * init_IRQ() routine. interrupts having to do with the machine rather
- * than the devices on the I/O bus (like APIC interrupts in intel MP
- * systems) are started here.
- **/
-void __init intr_init_hook(void)
-{
- if (x86_quirks->arch_intr_init) {
- if (x86_quirks->arch_intr_init())
- return;
- }
-}
-
-/**
- * pre_setup_arch_hook - hook called prior to any setup_arch() execution
- *
- * Description:
- * generally used to activate any machine specific identification
- * routines that may be needed before setup_arch() runs. On Voyager
- * this is used to get the board revision and type.
- **/
-void __init pre_setup_arch_hook(void)
-{
-}
-
-/**
- * trap_init_hook - initialise system specific traps
- *
- * Description:
- * Called as the final act of trap_init(). Used in VISWS to initialise
- * the various board specific APIC traps.
- **/
-void __init trap_init_hook(void)
-{
- if (x86_quirks->arch_trap_init) {
- if (x86_quirks->arch_trap_init())
- return;
- }
-}
-
-static struct irqaction irq0 = {
- .handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
- .mask = CPU_MASK_NONE,
- .name = "timer"
-};
-
-/**
- * pre_time_init_hook - do any specific initialisations before.
- *
- **/
-void __init pre_time_init_hook(void)
-{
- if (x86_quirks->arch_pre_time_init)
- x86_quirks->arch_pre_time_init();
-}
-
-/**
- * time_init_hook - do any specific initialisations for the system timer.
- *
- * Description:
- * Must plug the system timer interrupt source at HZ into the IRQ listed
- * in irq_vectors.h:TIMER_IRQ
- **/
-void __init time_init_hook(void)
-{
- if (x86_quirks->arch_time_init) {
- /*
- * A nonzero return code does not mean failure, it means
- * that the architecture quirk does not want any
- * generic (timer) setup to be performed after this:
- */
- if (x86_quirks->arch_time_init())
- return;
- }
-
- irq0.mask = cpumask_of_cpu(0);
- setup_irq(0, &irq0);
-}
-
-#ifdef CONFIG_MCA
-/**
- * mca_nmi_hook - hook into MCA specific NMI chain
- *
- * Description:
- * The MCA (Microchannel Architecture) has an NMI chain for NMI sources
- * along the MCA bus. Use this to hook into that chain if you will need
- * it.
- **/
-void mca_nmi_hook(void)
-{
- /* If I recall correctly, there's a whole bunch of other things that
- * we can do to check for NMI problems, but that's all I know about
- * at the moment.
- */
-
- printk("NMI generated from unknown source!\n");
-}
-#endif
-
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
- "IPI Broadcast");
- return 1;
-}
-
-__setup("no_ipi_broadcast=", no_ipi_broadcast);
-
-static int __init print_ipi_mode(void)
-{
- printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
- "Shortcut");
- return 0;
-}
-
-late_initcall(print_ipi_mode);
-
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
deleted file mode 100644
index 6730f4e7c74..00000000000
--- a/arch/x86/mach-generic/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for the generic architecture
-#
-
-EXTRA_CFLAGS := -Iarch/x86/kernel
-
-obj-y := probe.o default.o
-obj-$(CONFIG_X86_NUMAQ) += numaq.o
-obj-$(CONFIG_X86_SUMMIT) += summit.o
-obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
-obj-$(CONFIG_X86_ES7000) += es7000.o
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
deleted file mode 100644
index 3c3b471ea49..00000000000
--- a/arch/x86/mach-generic/bigsmp.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
- * Drives the local APIC in "clustered mode".
- */
-#define APIC_DEFINITION 1
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/genapic.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <asm/bigsmp/apicdef.h>
-#include <linux/smp.h>
-#include <asm/bigsmp/apic.h>
-#include <asm/bigsmp/ipi.h>
-#include <asm/mach-default/mach_mpparse.h>
-
-static int dmi_bigsmp; /* can be set by dmi scanners */
-
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
- dmi_bigsmp = 1;
- return 0;
-}
-
-
-static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),}
- },
-
- { hp_ht_bigsmp, "HP ProLiant DL740",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),}
- },
- { }
-};
-
-static cpumask_t vector_allocation_domain(int cpu)
-{
- return cpumask_of_cpu(cpu);
-}
-
-static int probe_bigsmp(void)
-{
- if (def_to_bigsmp)
- dmi_bigsmp = 1;
- else
- dmi_check_system(bigsmp_dmi_table);
- return dmi_bigsmp;
-}
-
-struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp);
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
deleted file mode 100644
index 9e835a11a13..00000000000
--- a/arch/x86/mach-generic/default.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Default generic APIC driver. This handles up to 8 CPUs.
- */
-#define APIC_DEFINITION 1
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/mach-default/mach_apicdef.h>
-#include <asm/genapic.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <asm/mach-default/mach_apic.h>
-#include <asm/mach-default/mach_ipi.h>
-#include <asm/mach-default/mach_mpparse.h>
-
-/* should be called last. */
-static int probe_default(void)
-{
- return 1;
-}
-
-struct genapic apic_default = APIC_INIT("default", probe_default);
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
deleted file mode 100644
index 28459cab3dd..00000000000
--- a/arch/x86/mach-generic/es7000.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * APIC driver for the Unisys ES7000 chipset.
- */
-#define APIC_DEFINITION 1
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/genapic.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <asm/es7000/apicdef.h>
-#include <linux/smp.h>
-#include <asm/es7000/apic.h>
-#include <asm/es7000/ipi.h>
-#include <asm/es7000/mpparse.h>
-#include <asm/es7000/wakecpu.h>
-
-static int probe_es7000(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-extern void es7000_sw_apic(void);
-static void __init enable_apic_mode(void)
-{
- es7000_sw_apic();
- return;
-}
-
-static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
-{
- if (mpc->mpc_oemptr) {
- struct mp_config_oemtable *oem_table =
- (struct mp_config_oemtable *)mpc->mpc_oemptr;
- if (!strncmp(oem, "UNISYS", 6))
- return parse_unisys_oem((char *)oem_table);
- }
- return 0;
-}
-
-#ifdef CONFIG_ACPI
-/* Hook from generic ACPI tables.c */
-static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- unsigned long oem_addr = 0;
- int check_dsdt;
- int ret = 0;
-
- /* check dsdt at first to avoid clear fix_map for oem_addr */
- check_dsdt = es7000_check_dsdt();
-
- if (!find_unisys_acpi_oem_table(&oem_addr)) {
- if (check_dsdt)
- ret = parse_unisys_oem((char *)oem_addr);
- else {
- setup_unisys();
- ret = 1;
- }
- /*
- * we need to unmap it
- */
- unmap_unisys_acpi_oem_table(oem_addr);
- }
- return ret;
-}
-#else
-static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-#endif
-
-static cpumask_t vector_allocation_domain(int cpu)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
- return domain;
-}
-
-struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
deleted file mode 100644
index 71a309b122e..00000000000
--- a/arch/x86/mach-generic/numaq.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * APIC driver for the IBM NUMAQ chipset.
- */
-#define APIC_DEFINITION 1
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/genapic.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <asm/numaq/apicdef.h>
-#include <linux/smp.h>
-#include <asm/numaq/apic.h>
-#include <asm/numaq/ipi.h>
-#include <asm/numaq/mpparse.h>
-#include <asm/numaq/wakecpu.h>
-#include <asm/numaq.h>
-
-static int mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
-{
- numaq_mps_oem_check(mpc, oem, productid);
- return found_numaq;
-}
-
-static int probe_numaq(void)
-{
- /* already know from get_memcfg_numaq() */
- return found_numaq;
-}
-
-/* Hook from generic ACPI tables.c */
-static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-
-static cpumask_t vector_allocation_domain(int cpu)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
- return domain;
-}
-
-struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
deleted file mode 100644
index 5a7e4619e1c..00000000000
--- a/arch/x86/mach-generic/probe.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License, v.2
- *
- * Generic x86 APIC driver probe layer.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/genapic.h>
-
-extern struct genapic apic_numaq;
-extern struct genapic apic_summit;
-extern struct genapic apic_bigsmp;
-extern struct genapic apic_es7000;
-extern struct genapic apic_default;
-
-struct genapic *genapic = &apic_default;
-
-static struct genapic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_NUMAQ
- &apic_numaq,
-#endif
-#ifdef CONFIG_X86_SUMMIT
- &apic_summit,
-#endif
-#ifdef CONFIG_X86_BIGSMP
- &apic_bigsmp,
-#endif
-#ifdef CONFIG_X86_ES7000
- &apic_es7000,
-#endif
- &apic_default, /* must be last */
- NULL,
-};
-
-static int cmdline_apic __initdata;
-static int __init parse_apic(char *arg)
-{
- int i;
-
- if (!arg)
- return -EINVAL;
-
- for (i = 0; apic_probe[i]; i++) {
- if (!strcmp(apic_probe[i]->name, arg)) {
- genapic = apic_probe[i];
- cmdline_apic = 1;
- return 0;
- }
- }
-
- /* Parsed again by __setup for debug/verbose */
- return 0;
-}
-early_param("apic", parse_apic);
-
-void __init generic_bigsmp_probe(void)
-{
-#ifdef CONFIG_X86_BIGSMP
- /*
- * This routine is used to switch to bigsmp mode when
- * - There is no apic= option specified by the user
- * - generic_apic_probe() has chosen apic_default as the sub_arch
- * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
- */
-
- if (!cmdline_apic && genapic == &apic_default)
- if (apic_bigsmp.probe()) {
- genapic = &apic_bigsmp;
- printk(KERN_INFO "Overriding APIC driver with %s\n",
- genapic->name);
- }
-#endif
-}
-
-void __init generic_apic_probe(void)
-{
- if (!cmdline_apic) {
- int i;
- for (i = 0; apic_probe[i]; i++) {
- if (apic_probe[i]->probe()) {
- genapic = apic_probe[i];
- break;
- }
- }
- /* Not visible without early console */
- if (!apic_probe[i])
- panic("Didn't find an APIC driver");
- }
- printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
-}
-
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
-{
- int i;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
- if (!cmdline_apic) {
- genapic = apic_probe[i];
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- genapic->name);
- }
- return 1;
- }
- }
- return 0;
-}
-
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- int i;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- if (!cmdline_apic) {
- genapic = apic_probe[i];
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- genapic->name);
- }
- return 1;
- }
- }
- return 0;
-}
-
-int hard_smp_processor_id(void)
-{
- return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID));
-}
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
deleted file mode 100644
index 6272b5e69da..00000000000
--- a/arch/x86/mach-generic/summit.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * APIC driver for the IBM "Summit" chipset.
- */
-#define APIC_DEFINITION 1
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/genapic.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <asm/summit/apicdef.h>
-#include <linux/smp.h>
-#include <asm/summit/apic.h>
-#include <asm/summit/ipi.h>
-#include <asm/summit/mpparse.h>
-
-static int probe_summit(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static cpumask_t vector_allocation_domain(int cpu)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
- return domain;
-}
-
-struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
deleted file mode 100644
index 8325b4ca431..00000000000
--- a/arch/x86/mach-rdc321x/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the RDC321x specific parts of the kernel
-#
-obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o
-
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
deleted file mode 100644
index 247f33d3a40..00000000000
--- a/arch/x86/mach-rdc321x/gpio.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * GPIO support for RDC SoC R3210/R8610
- *
- * Copyright (C) 2007, Florian Fainelli <florian@openwrt.org>
- * Copyright (C) 2008, Volker Weiss <dev@tintuc.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-
-#include <linux/spinlock.h>
-#include <linux/io.h>
-#include <linux/types.h>
-#include <linux/module.h>
-
-#include <asm/gpio.h>
-#include <asm/mach-rdc321x/rdc321x_defs.h>
-
-
-/* spin lock to protect our private copy of GPIO data register plus
- the access to PCI conf registers. */
-static DEFINE_SPINLOCK(gpio_lock);
-
-/* copy of GPIO data registers */
-static u32 gpio_data_reg1;
-static u32 gpio_data_reg2;
-
-static u32 gpio_request_data[2];
-
-
-static inline void rdc321x_conf_write(unsigned addr, u32 value)
-{
- outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
- outl(value, RDC3210_CFGREG_DATA);
-}
-
-static inline void rdc321x_conf_or(unsigned addr, u32 value)
-{
- outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
- value |= inl(RDC3210_CFGREG_DATA);
- outl(value, RDC3210_CFGREG_DATA);
-}
-
-static inline u32 rdc321x_conf_read(unsigned addr)
-{
- outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
-
- return inl(RDC3210_CFGREG_DATA);
-}
-
-/* configure pin as GPIO */
-static void rdc321x_configure_gpio(unsigned gpio)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&gpio_lock, flags);
- rdc321x_conf_or(gpio < 32
- ? RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2,
- 1 << (gpio & 0x1f));
- spin_unlock_irqrestore(&gpio_lock, flags);
-}
-
-/* initially setup the 2 copies of the gpio data registers.
- This function must be called by the platform setup code. */
-void __init rdc321x_gpio_setup()
-{
- /* this might not be, what others (BIOS, bootloader, etc.)
- wrote to these registers before, but it's a good guess. Still
- better than just using 0xffffffff. */
-
- gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1);
- gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2);
-}
-
-/* determine, if gpio number is valid */
-static inline int rdc321x_is_gpio(unsigned gpio)
-{
- return gpio <= RDC321X_MAX_GPIO;
-}
-
-/* request GPIO */
-int rdc_gpio_request(unsigned gpio, const char *label)
-{
- unsigned long flags;
-
- if (!rdc321x_is_gpio(gpio))
- return -EINVAL;
-
- spin_lock_irqsave(&gpio_lock, flags);
- if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f)))
- goto inuse;
- gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f));
- spin_unlock_irqrestore(&gpio_lock, flags);
-
- return 0;
-inuse:
- spin_unlock_irqrestore(&gpio_lock, flags);
- return -EINVAL;
-}
-EXPORT_SYMBOL(rdc_gpio_request);
-
-/* release previously-claimed GPIO */
-void rdc_gpio_free(unsigned gpio)
-{
- unsigned long flags;
-
- if (!rdc321x_is_gpio(gpio))
- return;
-
- spin_lock_irqsave(&gpio_lock, flags);
- gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f));
- spin_unlock_irqrestore(&gpio_lock, flags);
-}
-EXPORT_SYMBOL(rdc_gpio_free);
-
-/* read GPIO pin */
-int rdc_gpio_get_value(unsigned gpio)
-{
- u32 reg;
- unsigned long flags;
-
- spin_lock_irqsave(&gpio_lock, flags);
- reg = rdc321x_conf_read(gpio < 32
- ? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2);
- spin_unlock_irqrestore(&gpio_lock, flags);
-
- return (1 << (gpio & 0x1f)) & reg ? 1 : 0;
-}
-EXPORT_SYMBOL(rdc_gpio_get_value);
-
-/* set GPIO pin to value */
-void rdc_gpio_set_value(unsigned gpio, int value)
-{
- unsigned long flags;
- u32 reg;
-
- reg = 1 << (gpio & 0x1f);
- if (gpio < 32) {
- spin_lock_irqsave(&gpio_lock, flags);
- if (value)
- gpio_data_reg1 |= reg;
- else
- gpio_data_reg1 &= ~reg;
- rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1);
- spin_unlock_irqrestore(&gpio_lock, flags);
- } else {
- spin_lock_irqsave(&gpio_lock, flags);
- if (value)
- gpio_data_reg2 |= reg;
- else
- gpio_data_reg2 &= ~reg;
- rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2);
- spin_unlock_irqrestore(&gpio_lock, flags);
- }
-}
-EXPORT_SYMBOL(rdc_gpio_set_value);
-
-/* configure GPIO pin as input */
-int rdc_gpio_direction_input(unsigned gpio)
-{
- if (!rdc321x_is_gpio(gpio))
- return -EINVAL;
-
- rdc321x_configure_gpio(gpio);
-
- return 0;
-}
-EXPORT_SYMBOL(rdc_gpio_direction_input);
-
-/* configure GPIO pin as output and set value */
-int rdc_gpio_direction_output(unsigned gpio, int value)
-{
- if (!rdc321x_is_gpio(gpio))
- return -EINVAL;
-
- gpio_set_value(gpio, value);
- rdc321x_configure_gpio(gpio);
-
- return 0;
-}
-EXPORT_SYMBOL(rdc_gpio_direction_output);
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
deleted file mode 100644
index 4f4e50c3ad3..00000000000
--- a/arch/x86/mach-rdc321x/platform.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Generic RDC321x platform devices
- *
- * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the
- * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- *
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/leds.h>
-
-#include <asm/gpio.h>
-
-/* LEDS */
-static struct gpio_led default_leds[] = {
- { .name = "rdc:dmz", .gpio = 1, },
-};
-
-static struct gpio_led_platform_data rdc321x_led_data = {
- .num_leds = ARRAY_SIZE(default_leds),
- .leds = default_leds,
-};
-
-static struct platform_device rdc321x_leds = {
- .name = "leds-gpio",
- .id = -1,
- .dev = {
- .platform_data = &rdc321x_led_data,
- }
-};
-
-/* Watchdog */
-static struct platform_device rdc321x_wdt = {
- .name = "rdc321x-wdt",
- .id = -1,
- .num_resources = 0,
-};
-
-static struct platform_device *rdc321x_devs[] = {
- &rdc321x_leds,
- &rdc321x_wdt
-};
-
-static int __init rdc_board_setup(void)
-{
- rdc321x_gpio_setup();
-
- return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
-}
-
-arch_initcall(rdc_board_setup);
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index a580b9562e7..66b7eb57d8e 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -9,6 +9,7 @@
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/setup.h>
+#include <asm/cpu.h>
void __init pre_intr_init_hook(void)
{
@@ -33,13 +34,23 @@ void __init intr_init_hook(void)
setup_irq(2, &irq2);
}
-void __init pre_setup_arch_hook(void)
+static void voyager_disable_tsc(void)
{
/* Voyagers run their CPUs from independent clocks, so disable
* the TSC code because we can't sync them */
setup_clear_cpu_cap(X86_FEATURE_TSC);
}
+void __init pre_setup_arch_hook(void)
+{
+ voyager_disable_tsc();
+}
+
+void __init pre_time_init_hook(void)
+{
+ voyager_disable_tsc();
+}
+
void __init trap_init_hook(void)
{
}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 0e331652681..6f5a38c7f90 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -7,6 +7,7 @@
* This file provides all the same external entries as smp.c but uses
* the voyager hal to provide the functionality
*/
+#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
@@ -62,11 +63,6 @@ static int voyager_extended_cpus = 1;
/* Used for the invalidate map that's also checked in the spinlock */
static volatile unsigned long smp_invalidate_needed;
-/* Bitmask of currently online CPUs - used by setup.c for
- /proc/cpuinfo, visible externally but still physical */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
-
/* Bitmask of CPUs present in the system - exported by i386_syms.c, used
* by scheduler but indexed physically */
cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
@@ -85,7 +81,7 @@ static void enable_local_vic_irq(unsigned int irq);
static void disable_local_vic_irq(unsigned int irq);
static void before_handle_vic_irq(unsigned int irq);
static void after_handle_vic_irq(unsigned int irq);
-static void set_vic_irq_affinity(unsigned int irq, cpumask_t mask);
+static void set_vic_irq_affinity(unsigned int irq, const struct cpumask *mask);
static void ack_vic_irq(unsigned int irq);
static void vic_enable_cpi(void);
static void do_boot_cpu(__u8 cpuid);
@@ -215,10 +211,6 @@ static __u32 cpu_booted_map;
static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
/* This is for the new dynamic CPU boot code */
-cpumask_t cpu_callin_map = CPU_MASK_NONE;
-cpumask_t cpu_callout_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_possible_map);
/* The per processor IRQ masks (these are usually kept in sync) */
static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -363,9 +355,8 @@ void __init find_smp_config(void)
printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
/* initialize the CPU structures (moved from smp_boot_cpus) */
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++)
cpu_irq_affinity[i] = ~0;
- }
cpu_online_map = cpumask_of_cpu(boot_cpu_id);
/* The boot CPU must be extended */
@@ -385,7 +376,7 @@ void __init find_smp_config(void)
cpus_addr(phys_cpu_present_map)[0] |=
voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
3) << 24;
- cpu_possible_map = phys_cpu_present_map;
+ init_cpu_possible(&phys_cpu_present_map);
printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n",
cpus_addr(phys_cpu_present_map)[0]);
/* Here we set up the VIC to enable SMP */
@@ -409,7 +400,7 @@ void __init find_smp_config(void)
VOYAGER_SUS_IN_CONTROL_PORT);
current_thread_info()->cpu = boot_cpu_id;
- x86_write_percpu(cpu_number, boot_cpu_id);
+ percpu_write(cpu_number, boot_cpu_id);
}
/*
@@ -537,7 +528,6 @@ static void __init do_boot_cpu(__u8 cpu)
/* init_tasks (in sched.c) is indexed logically */
stack_start.sp = (void *)idle->thread.sp;
- init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
irq_ctx_init(cpu);
@@ -678,7 +668,7 @@ void __init smp_boot_cpus(void)
/* loop over all the extended VIC CPUs and boot them. The
* Quad CPUs must be bootstrapped by their extended VIC cpu */
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
continue;
do_boot_cpu(i);
@@ -1233,7 +1223,7 @@ int setup_profiling_timer(unsigned int multiplier)
* new values until the next timer interrupt in which they do process
* accounting.
*/
- for (i = 0; i < NR_CPUS; ++i)
+ for (i = 0; i < nr_cpu_ids; ++i)
per_cpu(prof_multiplier, i) = multiplier;
return 0;
@@ -1263,7 +1253,7 @@ void __init voyager_smp_intr_init(void)
int i;
/* initialize the per cpu irq mask to all disabled */
- for (i = 0; i < NR_CPUS; i++)
+ for (i = 0; i < nr_cpu_ids; i++)
vic_irq_mask[i] = 0xFFFF;
VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
@@ -1606,16 +1596,16 @@ static void after_handle_vic_irq(unsigned int irq)
* change the mask and then do an interrupt enable CPI to re-enable on
* the selected processors */
-void set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
+void set_vic_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
/* Only extended processors handle interrupts */
unsigned long real_mask;
unsigned long irq_mask = 1 << irq;
int cpu;
- real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors;
+ real_mask = cpus_addr(*mask)[0] & voyager_extended_vic_processors;
- if (cpus_addr(mask)[0] == 0)
+ if (cpus_addr(*mask)[0] == 0)
/* can't have no CPUs to accept the interrupt -- extremely
* bad things will happen */
return;
@@ -1754,13 +1744,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
static void __cpuinit voyager_smp_prepare_boot_cpu(void)
{
- init_gdt(smp_processor_id());
- switch_to_new_gdt();
+ int cpu = smp_processor_id();
+ switch_to_new_gdt(cpu);
- cpu_set(smp_processor_id(), cpu_online_map);
- cpu_set(smp_processor_id(), cpu_callout_map);
- cpu_set(smp_processor_id(), cpu_possible_map);
- cpu_set(smp_processor_id(), cpu_present_map);
+ cpu_online_map = cpumask_of_cpu(smp_processor_id());
+ cpu_callout_map = cpumask_of_cpu(smp_processor_id());
+ cpu_callin_map = CPU_MASK_NONE;
+ cpu_present_map = cpumask_of_cpu(smp_processor_id());
}
static int __cpuinit voyager_cpu_up(unsigned int cpu)
@@ -1787,7 +1777,17 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
void __init smp_setup_processor_id(void)
{
current_thread_info()->cpu = hard_smp_processor_id();
- x86_write_percpu(cpu_number, hard_smp_processor_id());
+}
+
+/*
+ * smp_ops hook: send the call-function CPI to every CPU named in the
+ * first word of @callmask, except the CPU executing this function.
+ */
+static void voyager_send_call_func(const struct cpumask *callmask)
+{
+ __u32 self = 1 << smp_processor_id();
+ __u32 targets = cpus_addr(*callmask)[0];
+
+ /* never send the CPI to ourselves */
+ send_CPI(targets & ~self, VIC_CALL_FUNCTION_CPI);
+}
+
+/*
+ * smp_ops hook: send the single-target call-function CPI to @cpu only,
+ * by passing send_CPI() a mask with just that CPU's bit set.
+ */
+static void voyager_send_call_func_single(int cpu)
+{
+ send_CPI(1 << cpu, VIC_CALL_FUNCTION_SINGLE_CPI);
}
struct smp_ops smp_ops = {
@@ -1799,6 +1799,6 @@ struct smp_ops smp_ops = {
.smp_send_stop = voyager_smp_send_stop,
.smp_send_reschedule = voyager_smp_send_reschedule,
- .send_call_func_ipi = native_send_call_func_ipi,
- .send_call_func_single_ipi = native_send_call_func_single_ipi,
+ .send_call_func_ipi = voyager_send_call_func,
+ .send_call_func_single_ipi = voyager_send_call_func_single,
};
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index c7b06feb139..5d87f586f8d 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -131,7 +131,7 @@ u_char emulating = 0;
static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip,
overrides * override);
-asmlinkage void math_emulate(long arg)
+void math_emulate(struct math_emu_info *info)
{
u_char FPU_modrm, byte1;
unsigned short code;
@@ -161,7 +161,7 @@ asmlinkage void math_emulate(long arg)
RE_ENTRANT_CHECK_ON;
#endif /* RE_ENTRANT_CHECKING */
- SETUP_DATA_AREA(arg);
+ FPU_info = info;
FPU_ORIG_EIP = FPU_EIP;
@@ -659,7 +659,7 @@ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
}
}
-void math_abort(struct info *info, unsigned int signal)
+void math_abort(struct math_emu_info *info, unsigned int signal)
{
FPU_EIP = FPU_ORIG_EIP;
current->thread.trap_no = 16;
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index aa49b6a0d85..9779df436b7 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -51,8 +51,8 @@ extern void ffreep(void);
extern void fst_i_(void);
extern void fstp_i(void);
/* fpu_entry.c */
-asmlinkage extern void math_emulate(long arg);
-extern void math_abort(struct info *info, unsigned int signal);
+extern void math_emulate(struct math_emu_info *info);
+extern void math_abort(struct math_emu_info *info, unsigned int signal);
/* fpu_etc.c */
extern void FPU_etc(void);
/* fpu_tags.c */
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 13488fa153e..50fa0ec2c8a 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -16,10 +16,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-/* This sets the pointer FPU_info to point to the argument part
- of the stack frame of math_emulate() */
-#define SETUP_DATA_AREA(arg) FPU_info = (struct info *) &arg
-
/* s is always from a cpu register, and the cpu does bounds checking
* during register load --> no further bounds checks needed */
#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3])
@@ -38,12 +34,12 @@
#define I387 (current->thread.xstate)
#define FPU_info (I387->soft.info)
-#define FPU_CS (*(unsigned short *) &(FPU_info->___cs))
-#define FPU_SS (*(unsigned short *) &(FPU_info->___ss))
-#define FPU_DS (*(unsigned short *) &(FPU_info->___ds))
-#define FPU_EAX (FPU_info->___eax)
-#define FPU_EFLAGS (FPU_info->___eflags)
-#define FPU_EIP (FPU_info->___eip)
+#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs))
+#define FPU_SS (*(unsigned short *) &(FPU_info->regs->ss))
+#define FPU_DS (*(unsigned short *) &(FPU_info->regs->ds))
+#define FPU_EAX (FPU_info->regs->ax)
+#define FPU_EFLAGS (FPU_info->regs->flags)
+#define FPU_EIP (FPU_info->regs->ip)
#define FPU_ORIG_EIP (FPU_info->___orig_eip)
#define FPU_lookahead (I387->soft.lookahead)
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index d701e2b39e4..6ef5e99380f 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -29,46 +29,43 @@
#define FPU_WRITE_BIT 0x10
static int reg_offset[] = {
- offsetof(struct info, ___eax),
- offsetof(struct info, ___ecx),
- offsetof(struct info, ___edx),
- offsetof(struct info, ___ebx),
- offsetof(struct info, ___esp),
- offsetof(struct info, ___ebp),
- offsetof(struct info, ___esi),
- offsetof(struct info, ___edi)
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di)
};
-#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info))
+#define REG_(x) (*(long *)(reg_offset[(x)] + (u_char *)FPU_info->regs))
static int reg_offset_vm86[] = {
- offsetof(struct info, ___cs),
- offsetof(struct info, ___vm86_ds),
- offsetof(struct info, ___vm86_es),
- offsetof(struct info, ___vm86_fs),
- offsetof(struct info, ___vm86_gs),
- offsetof(struct info, ___ss),
- offsetof(struct info, ___vm86_ds)
+ offsetof(struct pt_regs, cs),
+ offsetof(struct kernel_vm86_regs, ds),
+ offsetof(struct kernel_vm86_regs, es),
+ offsetof(struct kernel_vm86_regs, fs),
+ offsetof(struct kernel_vm86_regs, gs),
+ offsetof(struct pt_regs, ss),
+ offsetof(struct kernel_vm86_regs, ds)
};
#define VM86_REG_(x) (*(unsigned short *) \
- (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
-
-/* This dummy, gs is not saved on the stack. */
-#define ___GS ___ds
+ (reg_offset_vm86[((unsigned)x)] + (u_char *)FPU_info->regs))
static int reg_offset_pm[] = {
- offsetof(struct info, ___cs),
- offsetof(struct info, ___ds),
- offsetof(struct info, ___es),
- offsetof(struct info, ___fs),
- offsetof(struct info, ___GS),
- offsetof(struct info, ___ss),
- offsetof(struct info, ___ds)
+ offsetof(struct pt_regs, cs),
+ offsetof(struct pt_regs, ds),
+ offsetof(struct pt_regs, es),
+ offsetof(struct pt_regs, fs),
+ offsetof(struct pt_regs, ds), /* dummy, not saved on stack */
+ offsetof(struct pt_regs, ss),
+ offsetof(struct pt_regs, ds)
};
#define PM_REG_(x) (*(unsigned short *) \
- (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info))
+ (reg_offset_pm[((unsigned)x)] + (u_char *)FPU_info->regs))
/* Decode the SIB byte. This function assumes mod != 0 */
static int sib(int mod, unsigned long *fpu_eip)
@@ -153,11 +150,9 @@ static long pm_address(u_char FPU_modrm, u_char segment,
#endif /* PARANOID */
switch (segment) {
- /* gs isn't used by the kernel, so it still has its
- user-space value. */
case PREFIX_GS_ - 1:
- /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
- savesegment(gs, addr->selector);
+ /* user gs handling can be lazy, use special accessors */
+ addr->selector = get_user_gs(FPU_info->regs);
break;
default:
addr->selector = PM_REG_(segment);
@@ -349,34 +344,34 @@ void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
}
switch (rm) {
case 0:
- address += FPU_info->___ebx + FPU_info->___esi;
+ address += FPU_info->regs->bx + FPU_info->regs->si;
break;
case 1:
- address += FPU_info->___ebx + FPU_info->___edi;
+ address += FPU_info->regs->bx + FPU_info->regs->di;
break;
case 2:
- address += FPU_info->___ebp + FPU_info->___esi;
+ address += FPU_info->regs->bp + FPU_info->regs->si;
if (addr_modes.override.segment == PREFIX_DEFAULT)
addr_modes.override.segment = PREFIX_SS_;
break;
case 3:
- address += FPU_info->___ebp + FPU_info->___edi;
+ address += FPU_info->regs->bp + FPU_info->regs->di;
if (addr_modes.override.segment == PREFIX_DEFAULT)
addr_modes.override.segment = PREFIX_SS_;
break;
case 4:
- address += FPU_info->___esi;
+ address += FPU_info->regs->si;
break;
case 5:
- address += FPU_info->___edi;
+ address += FPU_info->regs->di;
break;
case 6:
- address += FPU_info->___ebp;
+ address += FPU_info->regs->bp;
if (addr_modes.override.segment == PREFIX_DEFAULT)
addr_modes.override.segment = PREFIX_SS_;
break;
case 7:
- address += FPU_info->___ebx;
+ address += FPU_info->regs->bx;
break;
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fea4565ff57..2b938a38491 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o gup.o
+obj-$(CONFIG_SMP) += tlb.o
+
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
@@ -8,9 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
-mmiotrace-y := pf_in.o mmio-mod.o
+mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa_$(BITS).o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a..61b41ca3b5a 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
fixup = search_exception_tables(regs->ip);
if (fixup) {
+ /* If fixup is less than 16, it means uaccess error */
+ if (fixup->fixup < 16) {
+ current_thread_info()->uaccess_err = -EFAULT;
+ regs->ip += fixup->fixup;
+ return 1;
+ }
regs->ip = fixup->fixup;
return 1;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730fa24..2a9ea3aee49 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
+#include <linux/magic.h>
#include <asm/system.h>
#include <asm/desc.h>
@@ -53,7 +54,7 @@
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
*
* Opcode checker based on code by Richard Brunner
*/
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
+static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
+ unsigned long addr)
{
unsigned char *instr;
int scan_more = 1;
@@ -393,7 +394,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(KERN_CRIT "kernel tried to execute "
"NX-protected page - exploit attempt? "
- "(uid: %d)\n", current->uid);
+ "(uid: %d)\n", current_uid());
}
#endif
@@ -409,24 +410,214 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
}
#ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+static noinline void pgtable_bad(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
{
unsigned long flags = oops_begin();
- struct task_struct *tsk;
+ int sig = SIGKILL;
+ struct task_struct *tsk = current;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
- current->comm, address);
+ tsk->comm, address);
dump_pagetable(address);
- tsk = current;
tsk->thread.cr2 = address;
tsk->thread.trap_no = 14;
tsk->thread.error_code = error_code;
if (__die("Bad pagetable", regs, error_code))
- regs = NULL;
- oops_end(flags, regs, SIGKILL);
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+#endif
+
+/*
+ * Last-resort handling for a fault that is not answered with a signal.
+ *
+ * Returns to the caller if the fault is absorbed by fixup_exception(),
+ * is_prefetch() or is_errata93(). Otherwise it reports an oops for
+ * @address, records the fault details in the current task's thread
+ * struct and terminates via die()/do_exit() (32-bit) or
+ * __die()/oops_end() (64-bit).
+ */
+static noinline void no_context(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+ unsigned long *stackend;
+
+#ifdef CONFIG_X86_64
+ unsigned long flags;
+ int sig;
+#endif
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
+
+ /*
+ * X86_32
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * X86_64
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+ bust_spinlocks(1);
+#else
+ flags = oops_begin();
+#endif
+
+ show_fault_oops(regs, error_code, address);
+
+ /* Warn if the magic word at the end of the task's stack was overwritten */
+ stackend = end_of_stack(tsk);
+ if (*stackend != STACK_END_MAGIC)
+ printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
+ /* Record the fault; 14 is the x86 page-fault vector number */
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+ die("Oops", regs, error_code);
+ bust_spinlocks(0);
+ do_exit(SIGKILL);
+#else
+ /* A nonzero return from __die() downgrades the signal passed on to 0 */
+ sig = SIGKILL;
+ if (__die("Oops", regs, error_code))
+ sig = 0;
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_EMERG "CR2: %016lx\n", address);
+ oops_end(flags, regs, sig);
+#endif
+}
+
+/*
+ * Handle a bad-address fault; this function itself does not touch
+ * mmap_sem.
+ *
+ * For a user-mode fault (PF_USER set in @error_code) the task is sent
+ * SIGSEGV with @si_code, unless is_prefetch() or is_errata100() claims
+ * the fault first. Any other fault is passed to no_context() unless
+ * is_f00f_bug() claims it.
+ */
+static void __bad_area_nosemaphore(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address,
+ int si_code)
+{
+ struct task_struct *tsk = current;
+
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here.
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+ /*
+ * Rate-limited log line for an unhandled SIGSEGV. The log level
+ * is KERN_EMERG when the faulting task's pid is 1 or lower,
+ * KERN_INFO otherwise.
+ */
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(
+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *) regs->ip, (void *) regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ tsk->thread.cr2 = address;
+ /* Kernel addresses are always protection faults */
+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+ no_context(regs, error_code, address);
+}
+
+/*
+ * Bad-address fault reported with si_code SEGV_MAPERR; does not touch
+ * mmap_sem (contrast with bad_area(), which releases it first).
+ */
+static noinline void bad_area_nosemaphore(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}
+
+/*
+ * Bad-address fault raised while current->mm->mmap_sem is held for
+ * reading: drop the semaphore, then hand the fault to
+ * __bad_area_nosemaphore() with the given @si_code.
+ */
+static void __bad_area(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address,
+ int si_code)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+/*
+ * Bad-address fault with mmap_sem held, reported with si_code
+ * SEGV_MAPERR. The semaphore is released by __bad_area().
+ */
+static noinline void bad_area(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+/*
+ * Bad-address fault with mmap_sem held, reported with si_code
+ * SEGV_ACCERR. The semaphore is released by __bad_area().
+ */
+static noinline void bad_area_access_error(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+/*
+ * Out-of-memory result from the fault handler: release mmap_sem and
+ * call pagefault_out_of_memory(). @regs, @error_code and @address are
+ * not used by this function.
+ */
+static void out_of_memory(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ /*
+ * We ran out of memory, call the OOM killer, and return to userspace
+ * (which will retry the fault, or kill us if we got oom-killed).
+ */
+ up_read(&current->mm->mmap_sem);
+ pagefault_out_of_memory();
+}
+
+/*
+ * SIGBUS result from the fault handler. Called with mmap_sem held for
+ * reading; the semaphore is released here.
+ *
+ * Kernel-mode faults are handed to no_context() and never raise SIGBUS.
+ * User-mode faults record the fault in the thread struct and force
+ * SIGBUS/BUS_ADRERR on the current task (on 32-bit, unless is_prefetch()
+ * claims the fault first).
+ */
+static void do_sigbus(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Kernel mode? Handle exceptions or die.
+ *
+ * no_context() returns when the fault was fixed up (exception
+ * table, prefetch or errata check). We must stop here in that
+ * case, exactly as the old "goto no_context" did, instead of
+ * falling through and sending SIGBUS to the current task.
+ */
+ if (!(error_code & PF_USER)) {
+ no_context(regs, error_code, address);
+ return;
+ }
+#ifdef CONFIG_X86_32
+ /* User space => ok to do another page fault */
+ if (is_prefetch(regs, error_code, address))
+ return;
#endif
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+/*
+ * Dispatch an error result from handle_mm_fault(): OOM is checked
+ * first, then SIGBUS; any other error bit combination is a BUG().
+ */
+static noinline void mm_fault_error(struct pt_regs *regs,
+ unsigned long error_code, unsigned long address, unsigned int fault)
+{
+ if (fault & VM_FAULT_OOM) {
+ out_of_memory(regs, error_code, address);
+ return;
+ }
+
+ if (fault & VM_FAULT_SIGBUS) {
+ do_sigbus(regs, error_code, address);
+ return;
+ }
+
+ /* no other error is expected from the fault handler */
+ BUG();
+}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
@@ -447,8 +638,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static int spurious_fault(unsigned long address,
- unsigned long error_code)
+static noinline int spurious_fault(unsigned long error_code,
+ unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
@@ -493,7 +684,7 @@ static int spurious_fault(unsigned long address,
*
* This assumes no large pages in there.
*/
-static int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
unsigned long pgd_paddr;
@@ -533,7 +724,7 @@ static int vmalloc_fault(unsigned long address)
happen within a race in page table update. In the later
case just flush. */
- pgd = pgd_offset(current->mm ?: &init_mm, address);
+ pgd = pgd_offset(current->active_mm, address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -572,6 +763,25 @@ static int vmalloc_fault(unsigned long address)
int show_unhandled_signals = 1;
+/*
+ * Check whether the faulting access is forbidden by @vma's permissions.
+ * Returns 1 if the access must be refused, 0 if it may proceed.
+ */
+static inline int access_error(unsigned long error_code, int write,
+ struct vm_area_struct *vma)
+{
+ if (write) {
+ /* any write, page present or not, needs a writable vma */
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
+
+ /* read of a page that is present: always an error */
+ if (unlikely(error_code & PF_PROT))
+ return 1;
+
+ /* read of a page that is not present: vma needs some access right */
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return 1;
+
+ return 0;
+}
+
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
@@ -582,15 +792,12 @@ asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
+ unsigned long address;
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct *vma;
- unsigned long address;
- int write, si_code;
+ int write;
int fault;
-#ifdef CONFIG_X86_64
- unsigned long flags;
-#endif
tsk = current;
mm = tsk->mm;
@@ -599,10 +806,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
/* get the address */
address = read_cr2();
- si_code = SEGV_MAPERR;
-
- if (notify_page_fault(regs))
- return;
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -629,17 +832,22 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
/* Can handle a stale RO->RW TLB */
- if (spurious_fault(address, error_code))
+ if (spurious_fault(error_code, address))
return;
+ /* kprobes don't want to hook the spurious faults. */
+ if (notify_page_fault(regs))
+ return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
*/
- goto bad_area_nosemaphore;
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
}
-
+ if (unlikely(notify_page_fault(regs)))
+ return;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -655,17 +863,18 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
- pgtable_bad(address, regs, error_code);
+ pgtable_bad(regs, error_code, address);
#endif
/*
* If we're in an interrupt, have no user context or are running in an
* atomic region then we must not take the fault.
*/
- if (unlikely(in_atomic() || !mm))
- goto bad_area_nosemaphore;
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
-again:
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -682,20 +891,26 @@ again:
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->ip))
- goto bad_area_nosemaphore;
+ !search_exception_tables(regs->ip)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
down_read(&mm->mmap_sem);
}
vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
+ if (unlikely(!vma)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (likely(vma->vm_start <= address))
goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
if (error_code & PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
@@ -703,31 +918,25 @@ again:
* and pusha to work. ("enter $65535,$31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
- if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
- goto bad_area;
+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
}
- if (expand_stack(vma, address))
- goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
good_area:
- si_code = SEGV_ACCERR;
- write = 0;
- switch (error_code & (PF_PROT|PF_WRITE)) {
- default: /* 3: write, present */
- /* fall through */
- case PF_WRITE: /* write, not present */
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- write++;
- break;
- case PF_PROT: /* read, present */
- goto bad_area;
- case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
+ write = error_code & PF_WRITE;
+ if (unlikely(access_error(error_code, write, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
}
/*
@@ -737,11 +946,8 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, write);
if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
if (fault & VM_FAULT_MAJOR)
tsk->maj_flt++;
@@ -759,138 +965,6 @@ good_area:
}
#endif
up_read(&mm->mmap_sem);
- return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
- /*
- * It's possible to have interrupts off here.
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata100(regs, address))
- return;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(
- "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
- (void *) regs->ip, (void *) regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
-
- tsk->thread.cr2 = address;
- /* Kernel addresses are always protection faults */
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
- return;
- }
-
- if (is_f00f_bug(regs, address))
- return;
-
-no_context:
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
- return;
-
- /*
- * X86_32
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * X86_64
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata93(regs, address))
- return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
- bust_spinlocks(1);
-#else
- flags = oops_begin();
-#endif
-
- show_fault_oops(regs, error_code, address);
-
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
- die("Oops", regs, error_code);
- bust_spinlocks(0);
- do_exit(SIGKILL);
-#else
- if (__die("Oops", regs, error_code))
- regs = NULL;
- /* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags, regs, SIGKILL);
-#endif
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (is_global_init(tsk)) {
- yield();
- /*
- * Re-lookup the vma - in theory the vma tree might
- * have changed:
- */
- goto again;
- }
-
- printk("VM: killing process %s\n", tsk->comm);
- if (error_code & PF_USER)
- do_group_exit(SIGKILL);
- goto no_context;
-
-do_sigbus:
- up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & PF_USER))
- goto no_context;
-#ifdef CONFIG_X86_32
- /* User space => ok to do another page fault */
- if (is_prefetch(regs, address, error_code))
- return;
-#endif
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}
DEFINE_SPINLOCK(pgd_lock);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c483f424207..06708ee94aa 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
@@ -48,7 +49,6 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
-#include <asm/smp.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -67,7 +67,7 @@ static unsigned long __meminitdata table_top;
static int __initdata after_init_bootmem;
-static __init void *alloc_low_page(unsigned long *phys)
+static __init void *alloc_low_page(void)
{
unsigned long pfn = table_end++;
void *adr;
@@ -77,7 +77,6 @@ static __init void *alloc_low_page(unsigned long *phys)
adr = __va(pfn * PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
- *phys = pfn * PAGE_SIZE;
return adr;
}
@@ -92,16 +91,17 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
- unsigned long phys;
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
if (after_init_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
- pmd_table = (pmd_t *)alloc_low_page(&phys);
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
@@ -126,10 +126,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!page_table)
page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- } else {
- unsigned long phys;
- page_table = (pte_t *)alloc_low_page(&phys);
- }
+ } else
+ page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -139,6 +137,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
return pte_offset_kernel(pmd, 0);
}
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+ unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+ /*
+ * Something (early fixmap) may already have put a pte
+ * page here, which causes the page table allocation
+ * to become nonlinear. Attempt to fix it, and if it
+ * is still nonlinear then we have to bug.
+ */
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+ if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+ && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+ && ((__pa(pte) >> PAGE_SHIFT) < table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ pte_t *newpte;
+ int i;
+
+ BUG_ON(after_init_bootmem);
+ newpte = alloc_low_page();
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ set_pte(newpte + i, pte[i]);
+
+ paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+ BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+ __flush_tlb_all();
+
+ paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+ pte = newpte;
+ }
+ BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+ && vaddr > fix_to_virt(FIX_KMAP_END)
+ && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+ return pte;
+}
+
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
@@ -155,6 +194,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
unsigned long vaddr;
pgd_t *pgd;
pmd_t *pmd;
+ pte_t *pte = NULL;
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -166,7 +206,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
- one_page_table_init(pmd);
+ pte = page_table_kmap_check(one_page_table_init(pmd),
+ pmd, vaddr, pte);
vaddr += PMD_SIZE;
}
@@ -329,6 +370,8 @@ int devmem_is_allowed(unsigned long pagenr)
{
if (pagenr <= 256)
return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
if (!page_is_ram(pagenr))
return 1;
return 0;
@@ -436,8 +479,12 @@ static void __init set_highmem_pages_init(void)
#endif /* !CONFIG_NUMA */
#else
-# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init() do { } while (0)
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+static inline void set_highmem_pages_init(void)
+{
+}
#endif /* CONFIG_HIGHMEM */
void __init native_pagetable_setup_start(pgd_t *base)
@@ -503,7 +550,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
- early_ioremap_clear();
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
@@ -628,75 +674,97 @@ static int __init parse_highmem(char *arg)
}
early_param("highmem", parse_highmem);
+#define MSG_HIGHMEM_TOO_BIG \
+ "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
/*
- * Determine low and high memory ranges:
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
*/
-void __init find_low_pfn_range(void)
+void __init lowmem_pfn_init(void)
{
- /* it could update max_pfn */
-
/* max_low_pfn is 0, we already have early_res support */
-
max_low_pfn = max_pfn;
- if (max_low_pfn > MAXMEM_PFN) {
- if (highmem_pages == -1)
- highmem_pages = max_pfn - MAXMEM_PFN;
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk(KERN_WARNING "only %luMB highmem pages "
- "available, ignoring highmem size of %uMB.\n",
- pages_to_mb(max_pfn - MAXMEM_PFN),
+
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+ pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
- max_low_pfn = MAXMEM_PFN;
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+ "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+void __init highmem_pfn_init(void)
+{
+ max_low_pfn = MAXMEM_PFN;
+
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
#ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
- MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING
- "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING "Warning only 4GB will be used."
- "Use a HIGHMEM64G enabled kernel.\n");
- }
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+ }
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
- } else {
- if (highmem_pages == -1)
- highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
- if (highmem_pages >= max_pfn) {
- printk(KERN_ERR "highmem size specified (%uMB) is "
- "bigger than pages available (%luMB)!.\n",
- pages_to_mb(highmem_pages),
- pages_to_mb(max_pfn));
- highmem_pages = 0;
- }
- if (highmem_pages) {
- if (max_low_pfn - highmem_pages <
- 64*1024*1024/PAGE_SIZE){
- printk(KERN_ERR "highmem size %uMB results in "
- "smaller than 64MB lowmem, ignoring it.\n"
- , pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn -= highmem_pages;
- }
-#else
- if (highmem_pages)
- printk(KERN_ERR "ignoring highmem size on non-highmem"
- " kernel!\n");
-#endif
- }
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ if (max_pfn <= MAXMEM_PFN)
+ lowmem_pfn_init();
+ else
+ highmem_pfn_init();
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -796,7 +864,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
tables += PAGE_ALIGN(ptes * sizeof(pte_t));
/* for fixmap */
- tables += PAGE_SIZE * 2;
+ tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
/*
* RED-PEN putting page tables only on node 0 could
@@ -969,7 +1037,7 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize;
int tmp;
- start_periodic_check_for_corruption();
+ pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
@@ -1040,11 +1108,25 @@ void __init mem_init(void)
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
- BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
if (boot_cpu_data.wp_works_ok < 0)
@@ -1062,7 +1144,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(zone, start_pfn, nr_pages);
+ return __add_pages(nid, zone, start_pfn, nr_pages);
}
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9db01db6e3c..e6d36b49025 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,7 +596,7 @@ static void __init init_gbpages(void)
direct_gbpages = 0;
}
-static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
@@ -857,7 +857,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
if (last_mapped_pfn > max_pfn_mapped)
max_pfn_mapped = last_mapped_pfn;
- ret = __add_pages(zone, start_pfn, nr_pages);
+ ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
return ret;
@@ -888,6 +888,8 @@ int devmem_is_allowed(unsigned long pagenr)
{
if (pagenr <= 256)
return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
if (!page_is_ram(pagenr))
return 1;
return 0;
@@ -902,8 +904,6 @@ void __init mem_init(void)
long codesize, reservedpages, datasize, initsize;
unsigned long absent_pages;
- start_periodic_check_for_corruption();
-
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index d0151d8ce45..ca53224fc56 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -17,6 +17,7 @@
*/
#include <asm/iomap.h>
+#include <asm/pat.h>
#include <linux/module.h>
/* Map 'pfn' using fixed map 'type' and protections 'prot'
@@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
pagefault_disable();
+ /*
+ * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+ * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+ * MTRR is UC or WC. UC_MINUS captures the real intention of the
+ * user, which is "WC if the MTRR is WC, UC if you can't do that."
+ */
+ if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+ prot = PAGE_KERNEL_UC_MINUS;
+
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4c4307ff3e..1448bcb7f22 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -223,7 +223,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* Check if the request spans more than any BAR in the iomem resource
* tree.
*/
- WARN_ON(iomem_map_sanity_check(phys_addr, size));
+ WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
/*
* Don't allow anybody to remap normal RAM that we're using..
@@ -366,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache);
*
* Must be freed with iounmap.
*/
-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -556,34 +557,9 @@ void __init early_ioremap_init(void)
}
}
-void __init early_ioremap_clear(void)
-{
- pmd_t *pmd;
-
- if (early_ioremap_debug)
- printk(KERN_INFO "early_ioremap_clear()\n");
-
- pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
- pmd_clear(pmd);
- paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
- __flush_tlb_all();
-}
-
void __init early_ioremap_reset(void)
{
- enum fixed_addresses idx;
- unsigned long addr, phys;
- pte_t *pte;
-
after_paging_init = 1;
- for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
- addr = fix_to_virt(idx);
- pte = early_ioremap_pte(addr);
- if (pte_present(*pte)) {
- phys = pte_val(*pte) & PAGE_MASK;
- set_fixmap(idx, phys);
- }
- }
}
static void __init __early_set_fixmap(enum fixed_addresses idx,
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 41f1b5c00a1..268f8255280 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -81,7 +81,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
unsigned numnodes, cores, bits, apicid_base;
unsigned long prevbase;
struct bootnode nodes[8];
- unsigned char nodeids[8];
int i, j, nb, found = 0;
u32 nodeid, reg;
@@ -110,7 +109,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
nodeid = limit & 7;
- nodeids[i] = nodeid;
if ((base & 3) == 0) {
if (i < numnodes)
printk("Skipping disabled node %d\n", i);
@@ -179,9 +177,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
nodes[nodeid].start = base;
nodes[nodeid].end = limit;
- e820_register_active_regions(nodeid,
- nodes[nodeid].start >> PAGE_SHIFT,
- nodes[nodeid].end >> PAGE_SHIFT);
prevbase = base;
@@ -211,12 +206,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
}
for (i = 0; i < 8; i++) {
- if (nodes[i].start != nodes[i].end) {
- nodeid = nodeids[i];
- for (j = apicid_base; j < cores + apicid_base; j++)
- apicid_to_node[(nodeid << bits) + j] = i;
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- }
+ if (nodes[i].start == nodes[i].end)
+ continue;
+
+ e820_register_active_regions(i,
+ nodes[i].start >> PAGE_SHIFT,
+ nodes[i].end >> PAGE_SHIFT);
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ apicid_to_node[(i << bits) + j] = i;
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
numa_init_array();
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 56fe7124fbe..16582960056 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -4,7 +4,7 @@
* Based on code by Ingo Molnar and Andi Kleen, copyrighted
* as follows:
*
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2003-2009 Red Hat Inc.
* All Rights Reserved.
* Copyright 2005 Andi Kleen, SUSE Labs.
* Copyright 2007 Jiri Kosina, SUSE Labs.
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 847c164725f..d1f7439d173 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -222,6 +222,41 @@ static void __init remap_numa_kva(void)
}
}
+#ifdef CONFIG_HIBERNATION
+/**
+ * resume_map_numa_kva - add KVA mapping to the temporary page tables created
+ * during resume from hibernation
+ * @pgd_base - temporary resume page directory
+ */
+void resume_map_numa_kva(pgd_t *pgd_base)
+{
+ int node;
+
+ for_each_online_node(node) {
+ unsigned long start_va, start_pfn, size, pfn;
+
+ start_va = (unsigned long)node_remap_start_vaddr[node];
+ start_pfn = node_remap_start_pfn[node];
+ size = node_remap_size[node];
+
+ printk(KERN_DEBUG "%s: node %d\n", __func__, node);
+
+ for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+ unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
+ pgd_t *pgd = pgd_base + pgd_index(vaddr);
+ pud_t *pud = pud_offset(pgd, vaddr);
+ pmd_t *pmd = pmd_offset(pud, vaddr);
+
+ set_pmd(pmd, pfn_pmd(start_pfn + pfn,
+ PAGE_KERNEL_LARGE_EXEC));
+
+ printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
+ __func__, vaddr, start_pfn + pfn);
+ }
+ }
+}
+#endif
+
static unsigned long calculate_numa_remap_pages(void)
{
int nid;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cebcbf152d4..deb1c1ab786 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
#include <asm/acpi.h>
#include <asm/k8.h>
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -33,6 +39,21 @@ int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;
+DEFINE_PER_CPU(int, node_number) = 0;
+EXPORT_PER_CPU_SYMBOL(node_number);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
/*
* Given a shift value, try to populate memnodemap[]
* Returns :
@@ -278,7 +299,7 @@ void __init numa_init_array(void)
int rr, i;
rr = first_node(node_online_map);
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
if (early_cpu_to_node(i) != NUMA_NO_NODE)
continue;
numa_set_node(i, rr);
@@ -549,7 +570,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
memnodemap[0] = 0;
node_set_online(0);
node_set(0, node_possible_map);
- for (i = 0; i < NR_CPUS; i++)
+ for (i = 0; i < nr_cpu_ids; i++)
numa_set_node(i, 0);
e820_register_active_regions(0, start_pfn, last_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
#endif
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+ unsigned int node, num = 0;
+ cpumask_t *map;
+
+ /* setup nr_node_ids if not done yet */
+ if (nr_node_ids == MAX_NUMNODES) {
+ for_each_node_mask(node, node_possible_map)
+ num = node;
+ nr_node_ids = num + 1;
+ }
+
+ /* allocate the map */
+ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+ DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
+
+ pr_debug("Node to cpumask map at %p for %d nodes\n",
+ map, nr_node_ids);
+
+ /* node_to_cpumask() will now work */
+ node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+ /* early setting, no percpu area yet */
+ if (cpu_to_node_map) {
+ cpu_to_node_map[cpu] = node;
+ return;
+ }
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+ dump_stack();
+ return;
+ }
+#endif
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+ if (node != NUMA_NO_NODE)
+ per_cpu(node_number, cpu) = node;
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+ numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+ cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ cpumask_t *mask;
+ char buf[64];
+
+ if (node_to_cpumask_map == NULL) {
+ printk(KERN_ERR "node_to_cpumask_map NULL\n");
+ dump_stack();
+ return;
+ }
+
+ mask = &node_to_cpumask_map[node];
+ if (enable)
+ cpu_set(cpu, *mask);
+ else
+ cpu_clear(cpu, *mask);
+
+ cpulist_scnprintf(buf, sizeof(buf), mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+ printk(KERN_WARNING
+ "cpu_to_node(%d): usage too early!\n", cpu);
+ dump_stack();
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ if (!cpu_possible(cpu)) {
+ printk(KERN_WARNING
+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+ dump_stack();
+ return NUMA_NO_NODE;
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const cpumask_t *cpumask_of_node(int node)
+{
+ if (node_to_cpumask_map == NULL) {
+ printk(KERN_WARNING
+ "cpumask_of_node(%d): no node_to_cpumask_map!\n",
+ node);
+ dump_stack();
+ return (const cpumask_t *)&cpu_online_map;
+ }
+ if (node >= nr_node_ids) {
+ printk(KERN_WARNING
+ "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+ node, nr_node_ids);
+ dump_stack();
+ return &cpu_mask_none;
+ }
+ return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used. The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+ if (node_to_cpumask_map == NULL) {
+ printk(KERN_WARNING
+ "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+ dump_stack();
+ return cpu_online_map;
+ }
+ if (node >= nr_node_ids) {
+ printk(KERN_WARNING
+ "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+ node, nr_node_ids);
+ dump_stack();
+ return cpu_mask_none;
+ }
+ return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e89d24815f2..84ba74820ad 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -534,6 +534,36 @@ out_unlock:
return 0;
}
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+ int primary)
+{
+ /*
+ * Ignore all non primary paths.
+ */
+ if (!primary)
+ return 0;
+
+ /*
+ * Ignore the NULL PTE for kernel identity mapping, as it is expected
+ * to have holes.
+ * Also set numpages to '1' indicating that we processed cpa req for
+ * one virtual address page and its pfn. TBD: numpages can be set based
+ * on the initial value and the level returned by lookup_address().
+ */
+ if (within(vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ cpa->numpages = 1;
+ cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+ return 0;
+ } else {
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
+ "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+ *cpa->vaddr);
+
+ return -EFAULT;
+ }
+}
+
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
unsigned long address;
@@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
- return 0;
+ return __cpa_process_fault(cpa, address, primary);
old_pte = *kpte;
- if (!pte_val(old_pte)) {
- if (!primary)
- return 0;
- WARN(1, KERN_WARNING "CPA: called for zero pte. "
- "vaddr = %lx cpa->vaddr = %lx\n", address,
- *cpa->vaddr);
- return -EINVAL;
- }
+ if (!pte_val(old_pte))
+ return __cpa_process_fault(cpa, address, primary);
if (level == PG_LEVEL_4K) {
pte_t new_pte;
@@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
vaddr = *cpa->vaddr;
if (!(within(vaddr, PAGE_OFFSET,
- PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
-#ifdef CONFIG_X86_64
- || within(vaddr, PAGE_OFFSET + (1UL<<32),
- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
-#endif
- )) {
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
alias_cpa = *cpa;
temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index eb1bf000d12..9127e31c726 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,7 +30,7 @@
#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;
-void __cpuinit pat_disable(char *reason)
+void __cpuinit pat_disable(const char *reason)
{
pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
@@ -42,6 +42,11 @@ static int __init nopat(char *str)
return 0;
}
early_param("nopat", nopat);
+#else
+static inline void pat_disable(const char *reason)
+{
+ (void)reason;
+}
#endif
@@ -78,16 +83,20 @@ void pat_init(void)
if (!pat_enabled)
return;
- /* Paranoia check. */
- if (!cpu_has_pat && boot_pat_state) {
- /*
- * If this happens we are on a secondary CPU, but
- * switched to PAT on the boot CPU. We have no way to
- * undo PAT.
- */
- printk(KERN_ERR "PAT enabled, "
- "but not supported by secondary CPU\n");
- BUG();
+ if (!cpu_has_pat) {
+ if (!boot_pat_state) {
+ pat_disable("PAT not supported by CPU.");
+ return;
+ } else {
+ /*
+ * If this happens we are on a secondary CPU, but
+ * switched to PAT on the boot CPU. We have no way to
+ * undo PAT.
+ */
+ printk(KERN_ERR "PAT enabled, "
+ "but not supported by secondary CPU\n");
+ BUG();
+ }
}
/* Set PWT to Write-Combining. All other bits stay the same */
@@ -333,11 +342,23 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
req_type & _PAGE_CACHE_MASK);
}
- is_range_ram = pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return reserve_ram_pages_type(start, end, req_type, new_type);
- else if (is_range_ram < 0)
- return -EINVAL;
+ if (new_type)
+ *new_type = actual_type;
+
+ /*
+ * For legacy reasons, some parts of the physical address range in the
+ * legacy 1MB region are treated as non-RAM (even when listed as RAM in
+ * the e820 tables). So we will track the memory attributes of this
+ * legacy 1MB region using the linear memtype_list always.
+ */
+ if (end >= ISA_END_ADDRESS) {
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return reserve_ram_pages_type(start, end, req_type,
+ new_type);
+ else if (is_range_ram < 0)
+ return -EINVAL;
+ }
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
@@ -347,9 +368,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
new->end = end;
new->type = actual_type;
- if (new_type)
- *new_type = actual_type;
-
spin_lock(&memtype_lock);
if (cached_entry && start >= cached_start)
@@ -437,11 +455,19 @@ int free_memtype(u64 start, u64 end)
if (is_ISA_range(start, end - 1))
return 0;
- is_range_ram = pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return free_ram_pages_type(start, end);
- else if (is_range_ram < 0)
- return -EINVAL;
+ /*
+ * For legacy reasons, some parts of the physical address range in the
+ * legacy 1MB region are treated as non-RAM (even when listed as RAM in
+ * the e820 tables). So we will track the memory attributes of this
+ * legacy 1MB region using the linear memtype_list always.
+ */
+ if (end >= ISA_END_ADDRESS) {
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return free_ram_pages_type(start, end);
+ else if (is_range_ram < 0)
+ return -EINVAL;
+ }
spin_lock(&memtype_lock);
list_for_each_entry(entry, &memtype_list, nd) {
@@ -596,6 +622,255 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserves non-RAM regions only; after a successful reserve_memtype(),
+ * this function also keeps the identity mapping (if any) in sync with the
+ * new prot.
+ *
+ * @paddr: start of the physical range
+ * @size: length of the range in bytes
+ * @vma_prot: requested protection; its cache bits are overwritten with the
+ *	type actually granted when that differs and the change is allowed
+ * @strict_prot: when non-zero, any difference between the requested and
+ *	the granted cache type is treated as an error
+ *
+ * Returns 0 on success, and also (after a one-time warning) when
+ * pagerange_is_ram() returns non-zero for the range, in which case nothing
+ * is reserved. Returns the reserve_memtype() error if that call fails, or
+ * -EINVAL if the granted type is not acceptable or ioremap_change_attr()
+ * fails.
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
+				int strict_prot)
+{
+	int is_ram = 0;
+	int ret;
+	/*
+	 * Same width as 'size': id_sz is derived from it, and an int would
+	 * truncate (or turn negative) for ranges of 2GB and above.
+	 */
+	unsigned long id_sz;
+	unsigned long flags;
+	unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+
+	is_ram = pagerange_is_ram(paddr, paddr + size);
+
+	if (is_ram != 0) {
+		/*
+		 * For mapping RAM pages, drivers need to call
+		 * set_memory_[uc|wc|wb] directly, for reserve and free, before
+		 * setting up the PTE.
+		 */
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+	if (ret)
+		return ret;
+
+	if (flags != want_flags) {
+		if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) {
+			/* Undo the reservation made just above before failing. */
+			free_memtype(paddr, paddr + size);
+			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
+				" for %Lx-%Lx, got %s\n",
+				current->comm, current->pid,
+				cattr_name(want_flags),
+				(unsigned long long)paddr,
+				(unsigned long long)(paddr + size),
+				cattr_name(flags));
+			return -EINVAL;
+		}
+		/*
+		 * We allow returning different type than the one requested in
+		 * non strict case.
+		 */
+		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
+				      (~_PAGE_CACHE_MASK)) |
+				     flags);
+	}
+
+	/* Need to keep identity mapping in sync */
+	if (paddr >= __pa(high_memory))
+		return 0;
+
+	/* Clamp to the part of the range that lies below high_memory. */
+	id_sz = (__pa(high_memory) < paddr + size) ?
+				__pa(high_memory) - paddr :
+				size;
+
+	if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
+		free_memtype(paddr, paddr + size);
+		printk(KERN_ERR
+			"%s:%d reserve_pfn_range ioremap_change_attr failed %s "
+			"for %Lx-%Lx\n",
+			current->comm, current->pid,
+			cattr_name(flags),
+			(unsigned long long)paddr,
+			(unsigned long long)(paddr + size));
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Frees non RAM regions only.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+ int is_ram;
+
+ is_ram = pagerange_is_ram(paddr, paddr + size);
+ if (is_ram == 0)
+ free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, we get the prot
+ * from pte and reserve the entire vma range with single reserve_pfn_range call.
+ * Otherwise, we reserve the entire vma range, by going through the PTEs page
+ * by page to get physical address and protection.
+ */
+int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+ int retval = 0;
+ unsigned long i, j;
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+ pgprot_t pgprot;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /*
+ * reserve the whole chunk covered by vma. We need the
+ * starting address and protection from pte.
+ */
+ if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ pgprot = __pgprot(prot);
+ return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+ }
+
+ /* reserve entire vma page by page, using pfn and prot from pte */
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
+ continue;
+
+ pgprot = __pgprot(prot);
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
+ continue;
+
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+/*
+ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
+ * for physical range indicated by pfn and size.
+ *
+ * prot is passed in as a parameter for the new mapping. If the vma has a
+ * linear pfn mapping for the entire range reserve the entire vma range with
+ * single reserve_pfn_range call.
+ * Otherwise, we look at the pfn and size and reserve only the specified range
+ * page by page.
+ *
+ * Note that this function can be called with caller trying to map only a
+ * subrange/page inside the vma.
+ */
+int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn, unsigned long size)
+{
+ int retval = 0;
+ unsigned long i, j;
+ resource_size_t base_paddr;
+ resource_size_t paddr;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* reserve the whole chunk starting from vm_pgoff */
+ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return reserve_pfn_range(paddr, vma_size, prot, 0);
+ }
+
+ /* reserve page by page using pfn and size */
+ base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ paddr = base_paddr + i;
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ paddr = base_paddr + j;
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+/*
+ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case size can be zero).
+ *
+ * @vma: the vma whose pfn mapping is being torn down
+ * @pfn: first page frame of the sub-range (used only for a partial untrack)
+ * @size: length of the sub-range in bytes; 0 or the full vma size means
+ *	"the entire vma"
+ *
+ * Does nothing when PAT is disabled.
+ */
+void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+			unsigned long size)
+{
+	unsigned long i;
+	resource_size_t base_paddr;
+	resource_size_t paddr;
+	unsigned long prot;
+	unsigned long vma_start = vma->vm_start;
+	unsigned long vma_size = vma->vm_end - vma_start;
+
+	if (!pat_enabled)
+		return;
+
+	if (is_linear_pfn_mapping(vma)) {
+		/* free the whole chunk starting from vm_pgoff */
+		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+		free_pfn_range(paddr, vma_size);
+		return;
+	}
+
+	if (size != 0 && size != vma_size) {
+		/*
+		 * free page by page, using pfn and size. The offset is added
+		 * to a fixed base: accumulating it into paddr would skip
+		 * pages (base, base+1, base+3, base+6, ... pages).
+		 */
+		base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
+		for (i = 0; i < size; i += PAGE_SIZE) {
+			paddr = base_paddr + i;
+			free_pfn_range(paddr, PAGE_SIZE);
+		}
+	} else {
+		/* free entire vma, page by page, using the pfn from pte */
+		for (i = 0; i < vma_size; i += PAGE_SIZE) {
+			/* pages for which follow_phys() fails are skipped */
+			if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
+				continue;
+
+			free_pfn_range(paddr, PAGE_SIZE);
+		}
+	}
+}
+
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ if (pat_enabled)
+ return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
+ else
+ return pgprot_noncached(prot);
+}
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
/* get Nth element of the linked list */
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 51c0a2fc14f..15df1baee10 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
#include <asm/numa.h>
#include <asm/e820.h>
#include <asm/genapic.h>
+#include <asm/uv/uv.h>
int acpi_numa __initdata;
@@ -382,7 +383,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
if (!node_online(i))
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
int node = early_cpu_to_node(i);
if (node == NUMA_NO_NODE)
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/mm/tlb.c
index 8f919ca6949..14c5af4d11e 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/mm/tlb.c
@@ -1,24 +1,20 @@
#include <linux/init.h>
#include <linux/mm.h>
-#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/apicdef.h>
-#include <asm/idle.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_bau.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
-#include <mach_ipi.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
+
+#include <asm/genapic.h>
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
@@ -33,7 +29,7 @@
* To avoid global state use 8 different call vectors.
* Each CPU uses a specific vector to trigger flushes on other
* CPUs. Depending on the received vector the target CPUs look into
- * the right per cpu variable for the flush data.
+ * the right array slot for the flush data.
*
* With more than 8 CPUs they are hashed to the 8 available
* vectors. The limited global vector space forces us to this right now.
@@ -43,18 +39,18 @@
union smp_flush_state {
struct {
- cpumask_t flush_cpumask;
struct mm_struct *flush_mm;
unsigned long flush_va;
spinlock_t tlbstate_lock;
+ DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
- char pad[SMP_CACHE_BYTES];
-} ____cacheline_aligned;
+ char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+} ____cacheline_internodealigned_in_smp;
/* State is put into the per CPU data section, but padded
to a full cache line because other CPUs can access it and we don't
want false sharing in the per cpu data segment. */
-static DEFINE_PER_CPU(union smp_flush_state, flush_state);
+static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
/*
* We cannot call mmdrop() because we are in interrupt context,
@@ -62,9 +58,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
*/
void leave_mm(int cpu)
{
- if (read_pda(mmu_state) == TLBSTATE_OK)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+ cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -117,10 +113,20 @@ EXPORT_SYMBOL_GPL(leave_mm);
* Interrupts are disabled.
*/
-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
+/*
+ * FIXME: use of asmlinkage is not consistent. On x86_64 it is a no-op
+ * and is kept only for documentation purposes. On x86_32, asmlinkage
+ * is regparm(0), but interrupt entry calls in with the first parameter
+ * in %eax. Maybe define intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
{
- int cpu;
- int sender;
+ unsigned int cpu;
+ unsigned int sender;
union smp_flush_state *f;
cpu = smp_processor_id();
@@ -129,9 +135,9 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
* Use that to determine where the sender put the data.
*/
sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
- f = &per_cpu(flush_state, sender);
+ f = &flush_state[sender];
- if (!cpu_isset(cpu, f->flush_cpumask))
+ if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
goto out;
/*
* This was a BUG() but until someone can quote me the
@@ -142,8 +148,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
* BUG();
*/
- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
+ if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -153,23 +159,21 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
}
out:
ack_APIC_irq();
- cpu_clear(cpu, f->flush_cpumask);
- add_pda(irq_tlb_count, 1);
+ smp_mb__before_clear_bit();
+ cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+ smp_mb__after_clear_bit();
+ inc_irq_stat(irq_tlb_count);
}
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long va)
{
- int sender;
+ unsigned int sender;
union smp_flush_state *f;
- cpumask_t cpumask = *cpumaskp;
-
- if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
- return;
/* Caller has disabled preemption */
sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
- f = &per_cpu(flush_state, sender);
+ f = &flush_state[sender];
/*
* Could avoid this lock when
@@ -180,7 +184,8 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
f->flush_mm = mm;
f->flush_va = va;
- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
+ cpumask_andnot(to_cpumask(f->flush_cpumask),
+ cpumask, cpumask_of(smp_processor_id()));
/*
* Make the above memory operations globally visible before
@@ -191,9 +196,10 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+ apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
+ INVALIDATE_TLB_VECTOR_START + sender);
- while (!cpus_empty(f->flush_cpumask))
+ while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
cpu_relax();
f->flush_mm = NULL;
@@ -201,12 +207,28 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
spin_unlock(&f->tlbstate_lock);
}
+/*
+ * Flush the TLBs of the CPUs in cpumask for mm/va.
+ *
+ * On non-UV systems the request goes straight to flush_tlb_others_ipi().
+ * On UV systems uv_flush_tlb_others() is tried first, bracketed by
+ * get_cpu()/put_cpu(); if it hands back a non-NULL mask, that mask is then
+ * passed to flush_tlb_others_ipi().
+ */
+void native_flush_tlb_others(const struct cpumask *cpumask,
+				 struct mm_struct *mm, unsigned long va)
+{
+	unsigned int this_cpu;
+
+	if (!is_uv_system()) {
+		flush_tlb_others_ipi(cpumask, mm, va);
+		return;
+	}
+
+	this_cpu = get_cpu();
+	cpumask = uv_flush_tlb_others(cpumask, mm, va, this_cpu);
+	if (cpumask)
+		flush_tlb_others_ipi(cpumask, mm, va);
+	put_cpu();
+}
+
+
static int __cpuinit init_smp_flush(void)
{
int i;
- for_each_possible_cpu(i)
- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
+ for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+ spin_lock_init(&flush_state[i].tlbstate_lock);
return 0;
}
@@ -215,25 +237,18 @@ core_initcall(init_smp_flush);
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
preempt_enable();
}
void flush_tlb_mm(struct mm_struct *mm)
{
- cpumask_t cpu_mask;
-
preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
if (current->active_mm == mm) {
if (current->mm)
@@ -241,8 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
else
leave_mm(smp_processor_id());
}
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -250,11 +265,8 @@ void flush_tlb_mm(struct mm_struct *mm)
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
if (current->active_mm == mm) {
if (current->mm)
@@ -263,8 +275,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
leave_mm(smp_processor_id());
}
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
+ if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(&mm->cpu_vm_mask, mm, va);
preempt_enable();
}
@@ -274,7 +286,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 022cd41ea9b..202864ad49a 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -401,14 +401,13 @@ static int __init ppro_init(char **cpu_type)
*cpu_type = "i386/pii";
break;
case 6 ... 8:
+ case 10 ... 11:
*cpu_type = "i386/piii";
break;
case 9:
+ case 13:
*cpu_type = "i386/p6_mobile";
break;
- case 10 ... 13:
- *cpu_type = "i386/p6";
- break;
case 14:
*cpu_type = "i386/core";
break;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 509513760a6..8fdf06e4edf 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -2,7 +2,7 @@
* @file op_model_amd.c
* athlon / K7 / K8 / Family 10h model-specific MSR operations
*
- * @remark Copyright 2002-2008 OProfile authors
+ * @remark Copyright 2002-2009 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon
@@ -10,7 +10,7 @@
* @author Graydon Hoare
* @author Robert Richter <robert.richter@amd.com>
* @author Barry Kasindorf
-*/
+ */
#include <linux/oprofile.h>
#include <linux/device.h>
@@ -60,56 +60,10 @@ static unsigned long reset_value[NUM_COUNTERS];
#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
-/* Codes used in cpu_buffer.c */
-/* This produces duplicate code, need to be fixed */
-#define IBS_FETCH_BEGIN 3
-#define IBS_OP_BEGIN 4
-
-/* The function interface needs to be fixed, something like add
- data. Should then be added to linux/oprofile.h. */
-extern void
-oprofile_add_ibs_sample(struct pt_regs *const regs,
- unsigned int *const ibs_sample, int ibs_code);
-
-struct ibs_fetch_sample {
- /* MSRC001_1031 IBS Fetch Linear Address Register */
- unsigned int ibs_fetch_lin_addr_low;
- unsigned int ibs_fetch_lin_addr_high;
- /* MSRC001_1030 IBS Fetch Control Register */
- unsigned int ibs_fetch_ctl_low;
- unsigned int ibs_fetch_ctl_high;
- /* MSRC001_1032 IBS Fetch Physical Address Register */
- unsigned int ibs_fetch_phys_addr_low;
- unsigned int ibs_fetch_phys_addr_high;
-};
-
-struct ibs_op_sample {
- /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */
- unsigned int ibs_op_rip_low;
- unsigned int ibs_op_rip_high;
- /* MSRC001_1035 IBS Op Data Register */
- unsigned int ibs_op_data1_low;
- unsigned int ibs_op_data1_high;
- /* MSRC001_1036 IBS Op Data 2 Register */
- unsigned int ibs_op_data2_low;
- unsigned int ibs_op_data2_high;
- /* MSRC001_1037 IBS Op Data 3 Register */
- unsigned int ibs_op_data3_low;
- unsigned int ibs_op_data3_high;
- /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */
- unsigned int ibs_dc_linear_low;
- unsigned int ibs_dc_linear_high;
- /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */
- unsigned int ibs_dc_phys_low;
- unsigned int ibs_dc_phys_high;
-};
-
-/*
- * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
-*/
-static void clear_ibs_nmi(void);
+#define IBS_FETCH_SIZE 6
+#define IBS_OP_SIZE 12
-static int ibs_allowed; /* AMD Family10h and later */
+static int has_ibs; /* AMD Family10h and later */
struct op_ibs_config {
unsigned long op_enabled;
@@ -200,31 +154,29 @@ static inline int
op_amd_handle_ibs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
- unsigned int low, high;
- struct ibs_fetch_sample ibs_fetch;
- struct ibs_op_sample ibs_op;
+ u32 low, high;
+ u64 msr;
+ struct op_entry entry;
- if (!ibs_allowed)
+ if (!has_ibs)
return 1;
if (ibs_config.fetch_enabled) {
rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
if (high & IBS_FETCH_HIGH_VALID_BIT) {
- ibs_fetch.ibs_fetch_ctl_high = high;
- ibs_fetch.ibs_fetch_ctl_low = low;
- rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high);
- ibs_fetch.ibs_fetch_lin_addr_high = high;
- ibs_fetch.ibs_fetch_lin_addr_low = low;
- rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high);
- ibs_fetch.ibs_fetch_phys_addr_high = high;
- ibs_fetch.ibs_fetch_phys_addr_low = low;
-
- oprofile_add_ibs_sample(regs,
- (unsigned int *)&ibs_fetch,
- IBS_FETCH_BEGIN);
-
- /*reenable the IRQ */
- rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr);
+ oprofile_write_reserve(&entry, regs, msr,
+ IBS_FETCH_CODE, IBS_FETCH_SIZE);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ oprofile_add_data(&entry, low);
+ oprofile_add_data(&entry, high);
+ rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ oprofile_write_commit(&entry);
+
+ /* reenable the IRQ */
high &= ~IBS_FETCH_HIGH_VALID_BIT;
high |= IBS_FETCH_HIGH_ENABLE;
low &= IBS_FETCH_LOW_MAX_CNT_MASK;
@@ -235,30 +187,29 @@ op_amd_handle_ibs(struct pt_regs * const regs,
if (ibs_config.op_enabled) {
rdmsr(MSR_AMD64_IBSOPCTL, low, high);
if (low & IBS_OP_LOW_VALID_BIT) {
- rdmsr(MSR_AMD64_IBSOPRIP, low, high);
- ibs_op.ibs_op_rip_low = low;
- ibs_op.ibs_op_rip_high = high;
- rdmsr(MSR_AMD64_IBSOPDATA, low, high);
- ibs_op.ibs_op_data1_low = low;
- ibs_op.ibs_op_data1_high = high;
- rdmsr(MSR_AMD64_IBSOPDATA2, low, high);
- ibs_op.ibs_op_data2_low = low;
- ibs_op.ibs_op_data2_high = high;
- rdmsr(MSR_AMD64_IBSOPDATA3, low, high);
- ibs_op.ibs_op_data3_low = low;
- ibs_op.ibs_op_data3_high = high;
- rdmsr(MSR_AMD64_IBSDCLINAD, low, high);
- ibs_op.ibs_dc_linear_low = low;
- ibs_op.ibs_dc_linear_high = high;
- rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high);
- ibs_op.ibs_dc_phys_low = low;
- ibs_op.ibs_dc_phys_high = high;
+ rdmsrl(MSR_AMD64_IBSOPRIP, msr);
+ oprofile_write_reserve(&entry, regs, msr,
+ IBS_OP_CODE, IBS_OP_SIZE);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ rdmsrl(MSR_AMD64_IBSOPDATA, msr);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ rdmsrl(MSR_AMD64_IBSOPDATA2, msr);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ rdmsrl(MSR_AMD64_IBSOPDATA3, msr);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ rdmsrl(MSR_AMD64_IBSDCLINAD, msr);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr);
+ oprofile_add_data(&entry, (u32)msr);
+ oprofile_add_data(&entry, (u32)(msr >> 32));
+ oprofile_write_commit(&entry);
/* reenable the IRQ */
- oprofile_add_ibs_sample(regs,
- (unsigned int *)&ibs_op,
- IBS_OP_BEGIN);
- rdmsr(MSR_AMD64_IBSOPCTL, low, high);
high = 0;
low &= ~IBS_OP_LOW_VALID_BIT;
low |= IBS_OP_LOW_ENABLE;
@@ -308,14 +259,14 @@ static void op_amd_start(struct op_msrs const * const msrs)
}
#ifdef CONFIG_OPROFILE_IBS
- if (ibs_allowed && ibs_config.fetch_enabled) {
+ if (has_ibs && ibs_config.fetch_enabled) {
low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */
+ IBS_FETCH_HIGH_ENABLE;
wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
}
- if (ibs_allowed && ibs_config.op_enabled) {
+ if (has_ibs && ibs_config.op_enabled) {
low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
+ ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
+ IBS_OP_LOW_ENABLE;
@@ -331,8 +282,10 @@ static void op_amd_stop(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
- /* Subtle: stop on all counters to avoid race with
- * setting our pm callback */
+ /*
+ * Subtle: stop on all counters to avoid race with setting our
+ * pm callback
+ */
for (i = 0 ; i < NUM_COUNTERS ; ++i) {
if (!reset_value[i])
continue;
@@ -342,14 +295,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
}
#ifdef CONFIG_OPROFILE_IBS
- if (ibs_allowed && ibs_config.fetch_enabled) {
- low = 0; /* clear max count and enable */
+ if (has_ibs && ibs_config.fetch_enabled) {
+ /* clear max count and enable */
+ low = 0;
high = 0;
wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
}
- if (ibs_allowed && ibs_config.op_enabled) {
- low = 0; /* clear max count and enable */
+ if (has_ibs && ibs_config.op_enabled) {
+ /* clear max count and enable */
+ low = 0;
high = 0;
wrmsr(MSR_AMD64_IBSOPCTL, low, high);
}
@@ -370,18 +325,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
}
}
-#ifndef CONFIG_OPROFILE_IBS
-
-/* no IBS support */
-
-static int op_amd_init(struct oprofile_operations *ops)
-{
- return 0;
-}
-
-static void op_amd_exit(void) {}
-
-#else
+#ifdef CONFIG_OPROFILE_IBS
static u8 ibs_eilvt_off;
@@ -395,7 +339,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
}
-static int pfm_amd64_setup_eilvt(void)
+static int init_ibs_nmi(void)
{
#define IBSCTL_LVTOFFSETVAL (1 << 8)
#define IBSCTL 0x1cc
@@ -419,6 +363,7 @@ static int pfm_amd64_setup_eilvt(void)
| IBSCTL_LVTOFFSETVAL);
pci_read_config_dword(cpu_cfg, IBSCTL, &value);
if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+ pci_dev_put(cpu_cfg);
printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
"IBSCTL = 0x%08x", value);
return 1;
@@ -443,33 +388,35 @@ static int pfm_amd64_setup_eilvt(void)
return 0;
}
-/*
- * initialize the APIC for the IBS interrupts
- * if available (AMD Family10h rev B0 and later)
- */
-static void setup_ibs(void)
+/* uninitialize the APIC for the IBS interrupts if needed */
+static void clear_ibs_nmi(void)
{
- ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
+ if (has_ibs)
+ on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+}
+
+/* initialize the APIC for the IBS interrupts if available */
+static void ibs_init(void)
+{
+ has_ibs = boot_cpu_has(X86_FEATURE_IBS);
- if (!ibs_allowed)
+ if (!has_ibs)
return;
- if (pfm_amd64_setup_eilvt()) {
- ibs_allowed = 0;
+ if (init_ibs_nmi()) {
+ has_ibs = 0;
return;
}
printk(KERN_INFO "oprofile: AMD IBS detected\n");
}
-
-/*
- * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
- * rev B0 and later */
-static void clear_ibs_nmi(void)
+static void ibs_exit(void)
{
- if (ibs_allowed)
- on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+ if (!has_ibs)
+ return;
+
+ clear_ibs_nmi();
}
static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -486,7 +433,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
if (ret)
return ret;
- if (!ibs_allowed)
+ if (!has_ibs)
return ret;
/* model specific files */
@@ -519,7 +466,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
static int op_amd_init(struct oprofile_operations *ops)
{
- setup_ibs();
+ ibs_init();
create_arch_files = ops->create_files;
ops->create_files = setup_ibs_files;
return 0;
@@ -527,10 +474,21 @@ static int op_amd_init(struct oprofile_operations *ops)
static void op_amd_exit(void)
{
- clear_ibs_nmi();
+ ibs_exit();
}
-#endif
+#else
+
+/* no IBS support */
+
+static int op_amd_init(struct oprofile_operations *ops)
+{
+ return 0;
+}
+
+static void op_amd_exit(void) {}
+
+#endif /* CONFIG_OPROFILE_IBS */
struct op_x86_model_spec const op_amd_spec = {
.init = op_amd_init,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 3f1b81a83e2..e9f80c744cf 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -69,7 +69,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
int i;
if (!reset_value) {
- reset_value = kmalloc(sizeof(unsigned) * num_counters,
+ reset_value = kmalloc(sizeof(reset_value[0]) * num_counters,
GFP_ATOMIC);
if (!reset_value)
return;
@@ -156,6 +156,8 @@ static void ppro_start(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
+ if (!reset_value)
+ return;
for (i = 0; i < num_counters; ++i) {
if (reset_value[i]) {
CTRL_READ(low, high, msrs, i);
@@ -171,6 +173,8 @@ static void ppro_stop(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
+ if (!reset_value)
+ return;
for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1d88d2b3977..c0ecf250fe5 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,7 +4,7 @@
#include <linux/irq.h>
#include <linux/dmi.h>
#include <asm/numa.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
struct pci_root_info {
char *name;
@@ -210,11 +210,10 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
if (bus && node != -1) {
#ifdef CONFIG_ACPI_NUMA
if (pxm >= 0)
- printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n",
- busnum, pxm, node);
+ dev_printk(KERN_DEBUG, &bus->dev,
+ "on NUMA node %d (pxm %d)\n", node, pxm);
#else
- printk(KERN_DEBUG "bus %02x -> node %d\n",
- busnum, node);
+ dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node);
#endif
}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 22e057665e5..9bb09823b36 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,7 +2,7 @@
#include <linux/pci.h>
#include <linux/topology.h>
#include <linux/cpu.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
#ifdef CONFIG_X86_64
#include <asm/pci-direct.h>
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index b67732bbb85..82d22fc601a 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -14,8 +14,7 @@
#include <asm/segment.h>
#include <asm/io.h>
#include <asm/smp.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
PCI_PROBE_MMCONF;
@@ -23,6 +22,12 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
unsigned int pci_early_dump_regs;
static int pci_bf_sort;
int pci_routeirq;
+int noioapicquirk;
+#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+int noioapicreroute = 0;
+#else
+int noioapicreroute = 1;
+#endif
int pcibios_last_bus = -1;
unsigned long pirq_table_addr;
struct pci_bus *pci_root_bus;
@@ -519,6 +524,17 @@ char * __devinit pcibios_setup(char *str)
} else if (!strcmp(str, "skip_isa_align")) {
pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
return NULL;
+ } else if (!strcmp(str, "noioapicquirk")) {
+ noioapicquirk = 1;
+ return NULL;
+ } else if (!strcmp(str, "ioapicreroute")) {
+ if (noioapicreroute != -1)
+ noioapicreroute = 0;
+ return NULL;
+ } else if (!strcmp(str, "noioapicreroute")) {
+ if (noioapicreroute != -1)
+ noioapicreroute = 1;
+ return NULL;
}
return str;
}
@@ -535,17 +551,25 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
if ((err = pci_enable_resources(dev, mask)) < 0)
return err;
- if (!dev->msi_enabled)
+ if (!pci_dev_msi_enabled(dev))
return pcibios_enable_irq(dev);
return 0;
}
void pcibios_disable_device (struct pci_dev *dev)
{
- if (!dev->msi_enabled && pcibios_disable_irq)
+ if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
pcibios_disable_irq(dev);
}
+/*
+ * Returns 1 when raw_pci_ext_ops is set and 0 otherwise; dev is not
+ * consulted.
+ */
+int pci_ext_cfg_avail(struct pci_dev *dev)
+{
+	return raw_pci_ext_ops ? 1 : 0;
+}
+
struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
{
struct pci_bus *bus = NULL;
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9915293500f..bd13c3e4c6d 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -5,7 +5,7 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/dmi.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/*
* Functions for accessing PCI base (first 256 bytes) and extended
@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
#undef PCI_CONF2_ADDRESS
-static struct pci_raw_ops pci_direct_conf2 = {
+struct pci_raw_ops pci_direct_conf2 = {
.read = pci_conf2_read,
.write = pci_conf2_write,
};
@@ -289,6 +289,7 @@ int __init pci_direct_probe(void)
if (pci_check_type1()) {
raw_pci_ops = &pci_direct_conf1;
+ port_cf9_safe = true;
return 1;
}
release_resource(region);
@@ -305,6 +306,7 @@ int __init pci_direct_probe(void)
if (pci_check_type2()) {
raw_pci_ops = &pci_direct_conf2;
+ port_cf9_safe = true;
return 2;
}
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 86631ccbc25..f6adf2c6d75 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -2,7 +2,7 @@
#include <linux/pci.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/* Direct PCI access. This is used for PCI accesses in early boot before
the PCI subsystem works. */
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 3c27a809393..7d388d5cf54 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,8 +6,7 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/init.h>
-#include "pci.h"
-
+#include <asm/pci_x86.h>
static void __devinit pci_fixup_i450nx(struct pci_dev *d)
{
@@ -496,21 +495,24 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
pci_siemens_interrupt_controller);
/*
- * Regular PCI devices have 256 bytes, but AMD Family 10h Opteron ext config
- * have 4096 bytes. Even if the device is capable, that doesn't mean we can
- * access it. Maybe we don't have a way to generate extended config space
- * accesses. So check it
+ * Regular PCI devices have 256 bytes, but AMD Family 10h/11h CPUs have
+ * 4096 bytes configuration space for each function of their processor
+ * configuration space.
*/
-static void fam10h_pci_cfg_space_size(struct pci_dev *dev)
+static void amd_cpu_pci_cfg_space_size(struct pci_dev *dev)
{
dev->cfg_size = pci_cfg_space_size_ext(dev);
}
-
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1300, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1301, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1302, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1303, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1304, amd_cpu_pci_cfg_space_size);
/*
* SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 844df0cbbd3..5ead808dd70 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -34,8 +34,8 @@
#include <asm/pat.h>
#include <asm/e820.h>
+#include <asm/pci_x86.h>
-#include "pci.h"
static int
skip_isa_ioresource_align(struct pci_dev *dev) {
@@ -129,7 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
pr = pci_find_parent_resource(dev, r);
if (!r->start || !pr ||
request_resource(pr, r) < 0) {
- dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
+ dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
/*
* Something is wrong with the region.
* Invalidate the resource to prevent
@@ -170,7 +170,7 @@ static void __init pcibios_allocate_resources(int pass)
r->flags, disabled, pass);
pr = pci_find_parent_resource(dev, r);
if (!pr || request_resource(pr, r) < 0) {
- dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
+ dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
/* We'll assign a new address later */
r->end -= r->start;
r->start = 0;
@@ -314,17 +314,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
return retval;
if (flags != new_flags) {
- /*
- * Do not fallback to certain memory types with certain
- * requested type:
- * - request is uncached, return cannot be write-back
- * - request is uncached, return cannot be write-combine
- * - request is write-combine, return cannot be write-back
- */
- if ((flags == _PAGE_CACHE_UC_MINUS &&
- (new_flags == _PAGE_CACHE_WB)) ||
- (flags == _PAGE_CACHE_WC &&
- new_flags == _PAGE_CACHE_WB)) {
+ if (!is_new_memtype_allowed(flags, new_flags)) {
free_memtype(addr, addr+len);
return -EINVAL;
}
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index d6c950f8185..25a1f8efed4 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,6 @@
#include <linux/pci.h>
#include <linux/init.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/* arch_initcall has too random ordering, so call the initializers
in the right sequence from here. */
@@ -12,7 +12,8 @@ static __init int pci_arch_init(void)
type = pci_direct_probe();
#endif
- pci_mmcfg_early_init();
+ if (!(pci_probe & PCI_PROBE_NOEARLY))
+ pci_mmcfg_early_init();
#ifdef CONFIG_PCI_OLPC
if (!pci_olpc_init())
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index bf69dbe08bf..fecbce6e7d7 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -16,8 +16,7 @@
#include <asm/io_apic.h>
#include <linux/irq.h>
#include <linux/acpi.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
#define PIRQ_VERSION 0x0100
@@ -534,7 +533,7 @@ static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
{
struct pci_dev *bridge;
int pin = pci_get_interrupt_pin(dev, &bridge);
- return pcibios_set_irq_routing(bridge, pin, irq);
+ return pcibios_set_irq_routing(bridge, pin - 1, irq);
}
#endif
@@ -573,6 +572,7 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH7_1:
case PCI_DEVICE_ID_INTEL_ICH7_30:
case PCI_DEVICE_ID_INTEL_ICH7_31:
+ case PCI_DEVICE_ID_INTEL_TGP_LPC:
case PCI_DEVICE_ID_INTEL_ESB2_0:
case PCI_DEVICE_ID_INTEL_ICH8_0:
case PCI_DEVICE_ID_INTEL_ICH8_1:
@@ -888,7 +888,6 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
- pin = pin - 1;
/* Find IRQ routing entry */
@@ -898,17 +897,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
info = pirq_get_info(dev);
if (!info) {
dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
- 'A' + pin);
+ 'A' + pin - 1);
return 0;
}
- pirq = info->irq[pin].link;
- mask = info->irq[pin].bitmap;
+ pirq = info->irq[pin - 1].link;
+ mask = info->irq[pin - 1].bitmap;
if (!pirq) {
- dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1);
return 0;
}
dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
- 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
+ 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -950,7 +949,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
newirq = i;
}
}
- dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
@@ -978,18 +977,18 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
return 0;
}
}
- dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
/* Update IRQ for all devices with the same pirq value */
while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
- pin--;
+
info = pirq_get_info(dev2);
if (!info)
continue;
- if (info->irq[pin].link == pirq) {
+ if (info->irq[pin - 1].link == pirq) {
/*
* We refuse to override the dev->irq
* information. Give a warning!
@@ -1043,6 +1042,9 @@ static void __init pcibios_fixup_irqs(void)
dev = NULL;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (!pin)
+ continue;
+
#ifdef CONFIG_X86_IO_APIC
/*
* Recalculate IRQ numbers if we use the I/O APIC.
@@ -1050,15 +1052,11 @@ static void __init pcibios_fixup_irqs(void)
if (io_apic_assign_pci_irqs) {
int irq;
- if (!pin)
- continue;
-
/*
* interrupt pins are numbered starting from 1
*/
- pin--;
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin);
+ PCI_SLOT(dev->devfn), pin - 1);
/*
* Busses behind bridges are typically not listed in the
* MP-table. In this case we have to look up the IRQ
@@ -1071,22 +1069,22 @@ static void __init pcibios_fixup_irqs(void)
struct pci_dev *bridge = dev->bus->self;
int bus;
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ pin = pci_swizzle_interrupt_pin(dev, pin);
bus = bridge->bus->number;
irq = IO_APIC_get_PCI_irq_vector(bus,
- PCI_SLOT(bridge->devfn), pin);
+ PCI_SLOT(bridge->devfn), pin - 1);
if (irq >= 0)
dev_warn(&dev->dev,
"using bridge %s INT %c to "
"get IRQ %d\n",
pci_name(bridge),
- 'A' + pin, irq);
+ 'A' + pin - 1, irq);
}
if (irq >= 0) {
dev_info(&dev->dev,
"PCI->APIC IRQ transform: INT %c "
"-> IRQ %d\n",
- 'A' + pin, irq);
+ 'A' + pin - 1, irq);
dev->irq = irq;
}
}
@@ -1094,7 +1092,7 @@ static void __init pcibios_fixup_irqs(void)
/*
* Still no IRQ? Try to lookup one...
*/
- if (pin && !dev->irq)
+ if (!dev->irq)
pcibios_lookup_irq(dev, 0);
}
}
@@ -1221,12 +1219,10 @@ static int pirq_enable_irq(struct pci_dev *dev)
if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
char *msg = "";
- pin--; /* interrupt pins are numbered starting from 1 */
-
if (io_apic_assign_pci_irqs) {
int irq;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1237,20 +1233,20 @@ static int pirq_enable_irq(struct pci_dev *dev)
while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
struct pci_dev *bridge = dev->bus->self;
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ pin = pci_swizzle_interrupt_pin(dev, pin);
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
- PCI_SLOT(bridge->devfn), pin);
+ PCI_SLOT(bridge->devfn), pin - 1);
if (irq >= 0)
dev_warn(&dev->dev, "using bridge %s "
"INT %c to get IRQ %d\n",
- pci_name(bridge), 'A' + pin,
+ pci_name(bridge), 'A' + pin - 1,
irq);
dev = bridge;
}
dev = temp_dev;
if (irq >= 0) {
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
- "INT %c -> IRQ %d\n", 'A' + pin, irq);
+ "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
dev->irq = irq;
return 0;
} else
@@ -1269,7 +1265,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
return 0;
dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
- 'A' + pin, msg);
+ 'A' + pin - 1, msg);
}
return 0;
}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index b722dd481b3..f1065b129e9 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -3,7 +3,7 @@
*/
#include <linux/init.h>
#include <linux/pci.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/*
* Discover remaining PCI buses in case there are peer host bridges.
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 654a2234f8f..89bf9242c80 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,8 +15,7 @@
#include <linux/acpi.h>
#include <linux/bitmap.h>
#include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
/* aperture is up to 256MB but BIOS may reserve less */
#define MMCONFIG_APER_MIN (2 * 1024*1024)
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f3c761dce69..8b2d561046a 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -13,7 +13,7 @@
#include <linux/init.h>
#include <linux/acpi.h>
#include <asm/e820.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/* Assume systems with more busses have correct MCFG */
#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index a1994163c99..30007ffc8e1 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -10,8 +10,7 @@
#include <linux/acpi.h>
#include <linux/bitmap.h>
#include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 1177845d318..5601e829c38 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -5,9 +5,9 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/nodemask.h>
-#include <mach_apic.h>
+#include <asm/genapic.h>
#include <asm/mpspec.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
#define XQUAD_PORTIO_BASE 0xfe400000
#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
@@ -18,10 +18,6 @@
#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-EXPORT_SYMBOL(xquad_portio);
-
#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index e11e9e803d5..b889d824f7c 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -29,7 +29,7 @@
#include <linux/init.h>
#include <asm/olpc.h>
#include <asm/geode.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/*
* In the tables below, the first two line (8 longwords) are the
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 37472fc6f72..1c975cc9839 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -6,9 +6,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/uaccess.h>
-#include "pci.h"
-#include "pci-functions.h"
-
+#include <asm/pci_x86.h>
+#include <asm/pci-functions.h>
/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 42f4cb19fac..bcead7a4687 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -9,11 +9,10 @@
#include <linux/init.h>
#include <asm/setup.h>
+#include <asm/pci_x86.h>
#include <asm/visws/cobalt.h>
#include <asm/visws/lithium.h>
-#include "pci.h"
-
static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
static void pci_visws_disable_irq(struct pci_dev *dev) { }
@@ -25,24 +24,6 @@ static void pci_visws_disable_irq(struct pci_dev *dev) { }
unsigned int pci_bus0, pci_bus1;
-static inline u8 bridge_swizzle(u8 pin, u8 slot)
-{
- return (((pin - 1) + slot) % 4) + 1;
-}
-
-static u8 __init visws_swizzle(struct pci_dev *dev, u8 *pinp)
-{
- u8 pin = *pinp;
-
- while (dev->bus->self) { /* Move up the chain of bridges. */
- pin = bridge_swizzle(pin, PCI_SLOT(dev->devfn));
- dev = dev->bus->self;
- }
- *pinp = pin;
-
- return PCI_SLOT(dev->devfn);
-}
-
static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
{
int irq, bus = dev->bus->number;
@@ -107,7 +88,7 @@ int __init pci_visws_init(void)
raw_pci_ops = &pci_direct_conf1;
pci_scan_bus_with_sysdata(pci_bus0);
pci_scan_bus_with_sysdata(pci_bus1);
- pci_fixup_irqs(visws_swizzle, visws_map_irq);
+ pci_fixup_irqs(pci_common_swizzle, visws_map_irq);
pcibios_resource_survey();
return 0;
}
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index f2b6e3f11bf..81197c62d5b 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -12,6 +12,7 @@
#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/mmzone.h>
/* Defined in hibernate_asm_32.S */
extern int restore_image(void);
@@ -127,6 +128,9 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
}
}
}
+
+ resume_map_numa_kva(pgd_base);
+
return 0;
}
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4d6ef0a336d..16a9020c8f1 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -38,7 +38,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS))
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
$(vobjs): KBUILD_CFLAGS += $(CFL)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d..d9d35824c56 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
* Also alternative() doesn't work.
*/
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
#include <linux/kernel.h>
#include <linux/posix-timers.h>
#include <linux/time.h>
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 513f330c583..1241f118ab5 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -310,7 +310,7 @@ int __init sysenter_setup(void)
}
/* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 257ba4a10ab..9c98cc6ba97 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
/* Setup a VMA at program startup for the vsyscall page.
Not called for compat tasks */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836..3b767d03fd6 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
endif
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
- time.o xen-asm_$(BITS).o grant-table.o suspend.o
+ time.o xen-asm.o xen-asm_$(BITS).o \
+ grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5e4686d70f6..95ff6a0e942 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
#include <linux/console.h>
#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/features.h>
@@ -60,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);
-/*
- * Identity map, in addition to plain kernel map. This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
-#ifdef CONFIG_X86_64
-/* l3 pud for userspace vsyscall mapping */
-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
-
-/*
- * Note about cr3 (pagetable base) values:
- *
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3. This may not be the current effective cr3, because
- * its update may be being lazily deferred. However, a vcpu looking
- * at its own cr3 can use this value knowing that it everything will
- * be self-consistent.
- *
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early). If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
- */
-DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
-
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
struct shared_info xen_dummy_shared_info;
+void *xen_initial_gdt;
+
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
@@ -113,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*
* 0: not available, 1: available
*/
-static int have_vcpu_info_placement =
-#ifdef CONFIG_X86_32
- 1
-#else
- 0
-#endif
- ;
-
+static int have_vcpu_info_placement = 1;
static void xen_vcpu_setup(int cpu)
{
@@ -236,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
-static void xen_leave_lazy(void)
+void xen_leave_lazy(void)
{
paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
@@ -356,13 +323,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone,
- * it means we're in a context switch, and %gs has just been
- * saved. This means we can zero it out to prevent faults on
- * exit from the hypervisor if the next process has no %gs.
- * Either way, it has been saved, and the new value will get
- * loaded properly. This will go away as soon as Xen has been
- * modified to not save/restore %gs for normal hypercalls.
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone
+ * and lazy gs handling is enabled, it means we're in a
+ * context switch, and %gs has just been saved. This means we
+ * can zero it out to prevent faults on exit from the
+ * hypervisor if the next process has no %gs. Either way, it
+ * has been saved, and the new value will get loaded properly.
+ * This will go away as soon as Xen has been modified to not
+ * save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -374,7 +342,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
- loadsegment(gs, 0);
+ lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif
@@ -597,83 +565,6 @@ static struct apic_ops xen_basic_apic_ops = {
#endif
-static void xen_flush_tlb(void)
-{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
-{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*op));
- op = mcs.args;
- op->cmd = MMUEXT_INVLPG_LOCAL;
- op->arg1.linear_addr = addr & PAGE_MASK;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
-}
-
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
- unsigned long va)
-{
- struct {
- struct mmuext_op op;
- cpumask_t mask;
- } *args;
- cpumask_t cpumask = *cpus;
- struct multicall_space mcs;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (cpus_empty(cpumask))
- return;
-
- mcs = xen_mc_entry(sizeof(*args));
- args = mcs.args;
- args->mask = cpumask;
- args->op.arg2.vcpumask = &args->mask;
-
- if (va == TLB_FLUSH_ALL) {
- args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- } else {
- args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = va;
- }
-
- MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
static void xen_clts(void)
{
@@ -699,21 +590,6 @@ static void xen_write_cr0(unsigned long cr0)
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-static void xen_write_cr2(unsigned long cr2)
-{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
-}
-
static void xen_write_cr4(unsigned long cr4)
{
cr4 &= ~X86_CR4_PGE;
@@ -722,78 +598,13 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
-static unsigned long xen_read_cr3(void)
-{
- return x86_read_percpu(xen_cr3);
-}
-
-static void set_current_cr3(void *v)
-{
- x86_write_percpu(xen_current_cr3, (unsigned long)v);
-}
-
-static void __xen_write_cr3(bool kernel, unsigned long cr3)
-{
- struct mmuext_op *op;
- struct multicall_space mcs;
- unsigned long mfn;
-
- if (cr3)
- mfn = pfn_to_mfn(PFN_DOWN(cr3));
- else
- mfn = 0;
-
- WARN_ON(mfn == 0 && kernel);
-
- mcs = __xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
- op->arg1.mfn = mfn;
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- if (kernel) {
- x86_write_percpu(xen_cr3, cr3);
-
- /* Update xen_current_cr3 once the batch has actually
- been submitted. */
- xen_mc_callback(set_current_cr3, (void *)cr3);
- }
-}
-
-static void xen_write_cr3(unsigned long cr3)
-{
- BUG_ON(preemptible());
-
- xen_mc_batch(); /* disables interrupts */
-
- /* Update while interrupts are disabled, so its atomic with
- respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
-
- __xen_write_cr3(true, cr3);
-
-#ifdef CONFIG_X86_64
- {
- pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
- if (user_pgd)
- __xen_write_cr3(false, __pa(user_pgd));
- else
- __xen_write_cr3(false, 0);
- }
-#endif
-
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
-}
-
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
ret = 0;
- switch(msr) {
+ switch (msr) {
#ifdef CONFIG_X86_64
unsigned which;
u64 base;
@@ -828,185 +639,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
return ret;
}
-/* Early in boot, while setting up the initial pagetable, assume
- everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
-{
-#ifdef CONFIG_FLATMEM
- BUG_ON(mem_map); /* should only be used early */
-#endif
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* Early release_pte assumes that all pts are pinned, since there's
- only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
-{
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-}
-
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
-}
-
-/* This needs to make sure the new pte page is pinned iff its being
- attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
-{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(virt_to_page(mm->pgd))) {
- SetPagePinned(page);
-
- vm_unmap_aliases();
- if (!PageHighMem(page)) {
- make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- } else {
- /* make sure there are no stray mappings of
- this page */
- kmap_flush_unused();
- }
- }
-}
-
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PTE);
-}
-
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
-
-static int xen_pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = mm->pgd;
- int ret = 0;
-
- BUG_ON(PagePinned(virt_to_page(pgd)));
-
-#ifdef CONFIG_X86_64
- {
- struct page *page = virt_to_page(pgd);
- pgd_t *user_pgd;
-
- BUG_ON(page->private != 0);
-
- ret = -ENOMEM;
-
- user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- page->private = (unsigned long)user_pgd;
-
- if (user_pgd != NULL) {
- user_pgd[pgd_index(VSYSCALL_START)] =
- __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
- ret = 0;
- }
-
- BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
- }
-#endif
-
- return ret;
-}
-
-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
- pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
- if (user_pgd)
- free_page((unsigned long)user_pgd);
-#endif
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
-{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(page)) {
- if (!PageHighMem(page)) {
- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
- }
- ClearPagePinned(page);
- }
-}
-
-static void xen_release_pte(unsigned long pfn)
-{
- xen_release_ptpage(pfn, PT_PTE);
-}
-
-static void xen_release_pmd(unsigned long pfn)
-{
- xen_release_ptpage(pfn, PT_PMD);
-}
-
-#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PUD);
-}
-
-static void xen_release_pud(unsigned long pfn)
-{
- xen_release_ptpage(pfn, PT_PUD);
-}
-#endif
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- pgprot_t prot = PAGE_KERNEL;
-
- if (PagePinned(page))
- prot = PAGE_KERNEL_RO;
-
- if (0 && PageHighMem(page))
- printk("mapping highpte %lx type %d prot %s\n",
- page_to_pfn(page), type,
- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
- return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
- pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
- pte_val_ma(pte));
-
- return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
- doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
- pte = mask_rw_pte(ptep, pte);
-
- xen_set_pte(ptep, pte);
-}
-#endif
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1027,37 +659,6 @@ void xen_setup_shared_info(void)
xen_setup_mfn_list_list();
}
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
- xen_setup_shared_info();
-}
-
-static __init void xen_post_allocator_init(void)
-{
- pv_mmu_ops.set_pte = xen_set_pte;
- pv_mmu_ops.set_pmd = xen_set_pmd;
- pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = xen_set_pgd;
-#endif
-
- /* This will work as long as patching hasn't happened yet
- (which it hasn't) */
- pv_mmu_ops.alloc_pte = xen_alloc_pte;
- pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
- pv_mmu_ops.release_pte = xen_release_pte;
- pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.alloc_pud = xen_alloc_pud;
- pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-
-#ifdef CONFIG_X86_64
- SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
- xen_mark_init_mm_pinned();
-}
-
/* This is called once we have the cpu_possible_map */
void xen_setup_vcpu_info_placement(void)
{
@@ -1071,10 +672,10 @@ void xen_setup_vcpu_info_placement(void)
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
- pv_irq_ops.save_fl = xen_save_fl_direct;
- pv_irq_ops.restore_fl = xen_restore_fl_direct;
- pv_irq_ops.irq_disable = xen_irq_disable_direct;
- pv_irq_ops.irq_enable = xen_irq_enable_direct;
+ pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+ pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
}
@@ -1132,49 +733,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
-{
- pte_t pte;
-
- phys >>= PAGE_SHIFT;
-
- switch (idx) {
- case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
- case FIX_F00F_IDT:
-#endif
-#ifdef CONFIG_X86_32
- case FIX_WP_TEST:
- case FIX_VDSO:
-# ifdef CONFIG_HIGHMEM
- case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#else
- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- case FIX_APIC_BASE: /* maps dummy local APIC */
-#endif
- pte = pfn_pte(phys, prot);
- break;
-
- default:
- pte = mfn_pte(phys, prot);
- break;
- }
-
- __native_set_fixmap(idx, pte);
-
-#ifdef CONFIG_X86_64
- /* Replicate changes to map the vsyscall page into the user
- pagetable vsyscall mapping. */
- if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
- unsigned long vaddr = __fix_to_virt(idx);
- set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
- }
-#endif
-}
-
static const struct pv_info xen_info __initdata = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
@@ -1270,87 +828,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
#endif
};
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
- .read_cr2 = xen_read_cr2,
- .write_cr2 = xen_write_cr2,
-
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3,
-
- .flush_tlb_user = xen_flush_tlb,
- .flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_single = xen_flush_tlb_single,
- .flush_tlb_others = xen_flush_tlb_others,
-
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
-
- .pgd_alloc = xen_pgd_alloc,
- .pgd_free = xen_pgd_free,
-
- .alloc_pte = xen_alloc_pte_init,
- .release_pte = xen_release_pte_init,
- .alloc_pmd = xen_alloc_pte_init,
- .alloc_pmd_clone = paravirt_nop,
- .release_pmd = xen_release_pte_init,
-
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
-#ifdef CONFIG_X86_64
- .set_pte = xen_set_pte,
-#else
- .set_pte = xen_set_pte_init,
-#endif
- .set_pte_at = xen_set_pte_at,
- .set_pmd = xen_set_pmd_hyper,
-
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
- .pte_val = xen_pte_val,
- .pte_flags = native_pte_flags,
- .pgd_val = xen_pgd_val,
-
- .make_pte = xen_make_pte,
- .make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = xen_set_pte_atomic,
- .set_pte_present = xen_set_pte_at,
- .pte_clear = xen_pte_clear,
- .pmd_clear = xen_pmd_clear,
-#endif /* CONFIG_X86_PAE */
- .set_pud = xen_set_pud_hyper,
-
- .make_pmd = xen_make_pmd,
- .pmd_val = xen_pmd_val,
-
-#if PAGETABLE_LEVELS == 4
- .pud_val = xen_pud_val,
- .make_pud = xen_make_pud,
- .set_pgd = xen_set_pgd_hyper,
-
- .alloc_pud = xen_alloc_pte_init,
- .release_pud = xen_release_pte_init,
-#endif /* PAGETABLE_LEVELS == 4 */
-
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
- .exit_mmap = xen_exit_mmap,
-
- .lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy,
- },
-
- .set_fixmap = xen_set_fixmap,
-};
-
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
@@ -1393,221 +870,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
};
-static void __init xen_reserve_top(void)
-{
-#ifdef CONFIG_X86_32
- unsigned long top = HYPERVISOR_VIRT_START;
- struct xen_platform_parameters pp;
-
- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
- top = pp.virt_start;
-
- reserve_top_address(-top);
-#endif /* CONFIG_X86_32 */
-}
-
-/*
- * Like __va(), but returns address in the kernel mapping (which is
- * all we have until the physical memory mapping has been set up.
- */
-static void *__ka(phys_addr_t paddr)
-{
-#ifdef CONFIG_X86_64
- return (void *)(paddr + __START_KERNEL_map);
-#else
- return __va(paddr);
-#endif
-}
-
-/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
-{
- phys_addr_t paddr;
-
- maddr &= PTE_PFN_MASK;
- paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
-
- return paddr;
-}
-
-/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
-{
- return __ka(m2p(maddr));
-}
-
-static void set_page_prot(void *addr, pgprot_t prot)
-{
- unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
- pte_t pte = pfn_pte(pfn, prot);
-
- if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
- BUG();
-}
-
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
- unsigned pmdidx, pteidx;
- unsigned ident_pte;
- unsigned long pfn;
-
- ident_pte = 0;
- pfn = 0;
- for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
- pte_t *pte_page;
-
- /* Reuse or allocate a page of ptes */
- if (pmd_present(pmd[pmdidx]))
- pte_page = m2v(pmd[pmdidx].pmd);
- else {
- /* Check for free pte pages */
- if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
- break;
-
- pte_page = &level1_ident_pgt[ident_pte];
- ident_pte += PTRS_PER_PTE;
-
- pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
- }
-
- /* Install mappings */
- for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
- pte_t pte;
-
- if (pfn > max_pfn_mapped)
- max_pfn_mapped = pfn;
-
- if (!pte_none(pte_page[pteidx]))
- continue;
-
- pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
- pte_page[pteidx] = pte;
- }
- }
-
- for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
- set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-
- set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-
-#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
-{
- pte_t *pte = v;
- int i;
-
- /* All levels are converted the same way, so just treat them
- as ptes. */
- for(i = 0; i < PTRS_PER_PTE; i++)
- pte[i] = xen_make_pte(pte[i].pte);
-}
-
-/*
- * Set up the inital kernel pagetable.
- *
- * We can construct this by grafting the Xen provided pagetable into
- * head_64.S's preconstructed pagetables. We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working. We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- */
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
-{
- pud_t *l3;
- pmd_t *l2;
-
- /* Zap identity mapping */
- init_level4_pgt[0] = __pgd(0);
-
- /* Pre-constructed entries are in pfn, so convert to mfn */
- convert_pfn_mfn(init_level4_pgt);
- convert_pfn_mfn(level3_ident_pgt);
- convert_pfn_mfn(level3_kernel_pgt);
-
- l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
- l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
-
- memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
- memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
- l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
- l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
- memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
- /* Set up identity map */
- xen_map_identity_early(level2_ident_pgt, max_pfn);
-
- /* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-
- /* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
- /* Unpin Xen-provided one */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
- /* Switch over */
- pgd = init_level4_pgt;
-
- /*
- * At this stage there can be no user pgd, and no page
- * structure to attach it to, so make sure we just set kernel
- * pgd.
- */
- xen_mc_batch();
- __xen_write_cr3(true, __pa(pgd));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-
- reserve_early(__pa(xen_start_info->pt_base),
- __pa(xen_start_info->pt_base +
- xen_start_info->nr_pt_frames * PAGE_SIZE),
- "XEN PAGETABLES");
-
- return pgd;
-}
-#else /* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
-{
- pmd_t *kernel_pmd;
-
- init_pg_tables_start = __pa(pgd);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
- max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
-
- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
- memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-
- xen_map_identity_early(level2_kernel_pgt, max_pfn);
-
- memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
- set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
- __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
- set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
-
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
- xen_write_cr3(__pa(swapper_pg_dir));
-
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
-
- return swapper_pg_dir;
-}
-#endif /* CONFIG_X86_64 */
-
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -1647,10 +909,18 @@ asmlinkage void __init xen_start_kernel(void)
machine_ops = xen_machine_ops;
#ifdef CONFIG_X86_64
- /* Disable until direct per-cpu data access. */
- have_vcpu_info_placement = 0;
- x86_64_init_pda();
+ /*
+ * Setup percpu state. We only need to do this for 64-bit
+ * because 32-bit already has %fs set properly.
+ */
+ load_percpu_segment(0);
#endif
+ /*
+ * The only reliable way to retain the initial address of the
+ * percpu gdt_page is to remember it here, so we can go and
+ * mark it RW later, when the initial percpu area is freed.
+ */
+ xen_initial_gdt = &per_cpu(gdt_page, 0);
xen_smp_init();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c60..cfd17799bd6 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static void __init __xen_init_IRQ(void)
-{
- int i;
-
- /* Create identity vector->irq map */
- for(i = 0; i < NR_VECTORS; i++) {
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[i] = i;
- }
-
- xen_init_IRQ();
-}
-
static unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void)
*/
return (-flags) & X86_EFLAGS_IF;
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
static void xen_restore_fl(unsigned long flags)
{
@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long flags)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();
@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long flags)
xen_force_evtchn_callback();
}
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
static void xen_irq_disable(void)
{
@@ -83,9 +70,10 @@ static void xen_irq_disable(void)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
static void xen_irq_enable(void)
{
@@ -96,7 +84,7 @@ static void xen_irq_enable(void)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
@@ -106,6 +94,7 @@ static void xen_irq_enable(void)
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
static void xen_safe_halt(void)
{
@@ -123,11 +112,13 @@ static void xen_halt(void)
}
static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = __xen_init_IRQ,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
+ .init_IRQ = xen_init_IRQ,
+
+ .save_fl = PV_CALLEE_SAVE(xen_save_fl),
+ .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+ .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+ .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 688936044dc..d2e8ed1aff3 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
+#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>
@@ -55,6 +56,8 @@
#include <xen/page.h>
#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/hvc-console.h>
#include "multicalls.h"
#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
#endif /* CONFIG_XEN_DEBUG_FS */
+
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+
+
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
@@ -154,13 +188,13 @@ void xen_setup_mfn_list_list(void)
{
unsigned pfn, idx;
- for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
}
- for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+ for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
}
@@ -179,7 +213,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
unsigned pfn;
- for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
p2m_top[topidx] = &mfn_list[pfn];
@@ -207,7 +241,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
BUG_ON(p == NULL);
- for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+ for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
p[i] = INVALID_P2M_ENTRY;
if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
@@ -407,7 +441,8 @@ out:
preempt_enable();
}
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
return *ptep;
@@ -457,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
{
return pte_mfn_to_pfn(pte.pte);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
pgdval_t xen_pgd_val(pgd_t pgd)
{
return pte_mfn_to_pfn(pgd.pgd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
pte_t xen_make_pte(pteval_t pte)
{
pte = pte_pfn_to_mfn(pte);
return native_make_pte(pte);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
pgd_t xen_make_pgd(pgdval_t pgd)
{
pgd = pte_pfn_to_mfn(pgd);
return native_make_pgd(pgd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
pmdval_t xen_pmd_val(pmd_t pmd)
{
return pte_mfn_to_pfn(pmd.pmd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
@@ -555,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
pud_t xen_make_pud(pudval_t pud)
{
@@ -568,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
return native_make_pud(pud);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
@@ -661,12 +704,11 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
* For 64-bit, we must skip the Xen hole in the middle of the address
* space, just after the big x86-64 virtual hole.
*/
-static int xen_pgd_walk(struct mm_struct *mm,
- int (*func)(struct mm_struct *mm, struct page *,
- enum pt_level),
- unsigned long limit)
+static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
{
- pgd_t *pgd = mm->pgd;
int flush = 0;
unsigned hole_low, hole_high;
unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -753,6 +795,14 @@ out:
return flush;
}
+static int xen_pgd_walk(struct mm_struct *mm,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
+{
+ return __xen_pgd_walk(mm, mm->pgd, func, limit);
+}
+
/* If we're using split pte locks, then take the page's lock and
return a pointer to it. Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
@@ -854,7 +904,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
xen_mc_batch();
- if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
+ if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
/* re-enable interrupts for flushing */
xen_mc_issue(0);
@@ -871,7 +921,8 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
if (user_pgd) {
xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
- xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+ xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
}
}
#else /* CONFIG_X86_32 */
@@ -986,7 +1037,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd) {
- xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+ xen_do_pin(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
}
}
@@ -998,7 +1050,7 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
PT_PMD);
#endif
- xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
+ __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
@@ -1053,18 +1105,14 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;
-#ifdef CONFIG_X86_64
- active_mm = read_pda(active_mm);
-#else
- active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+ active_mm = percpu_read(cpu_tlbstate.active_mm);
if (active_mm == mm)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
load_cr3(swapper_pg_dir);
arch_flush_lazy_cpu_mode();
}
@@ -1072,7 +1120,7 @@ static void drop_other_mm_ref(void *info)
static void xen_drop_mm_ref(struct mm_struct *mm)
{
- cpumask_t mask;
+ cpumask_var_t mask;
unsigned cpu;
if (current->active_mm == mm) {
@@ -1084,7 +1132,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
}
/* Get the "official" set of cpus referring to our pagetable. */
- mask = mm->cpu_vm_mask;
+ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+ && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ continue;
+ smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ }
+ return;
+ }
+ cpumask_copy(mask, &mm->cpu_vm_mask);
/* It's possible that a vcpu may have a stale reference to our
cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1093,11 +1150,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
if needed. */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
- cpu_set(cpu, mask);
+ cpumask_set_cpu(cpu, mask);
}
- if (!cpus_empty(mask))
- smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+ if (!cpumask_empty(mask))
+ smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1136,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+ xen_setup_shared_info();
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+ percpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return percpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+ return percpu_read(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+ struct mm_struct *mm, unsigned long va)
+{
+ struct {
+ struct mmuext_op op;
+ DECLARE_BITMAP(mask, NR_CPUS);
+ } *args;
+ struct multicall_space mcs;
+
+ BUG_ON(cpumask_empty(cpus));
+ BUG_ON(!mm);
+
+ mcs = xen_mc_entry(sizeof(*args));
+ args = mcs.args;
+ args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+ /* Remove us, and any offline CPUS. */
+ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+ if (unlikely(cpumask_empty(to_cpumask(args->mask))))
+ goto issue;
+
+ if (va == TLB_FLUSH_ALL) {
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ } else {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = va;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+issue:
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return percpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+ percpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+ unsigned long mfn;
+
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
+
+ WARN_ON(mfn == 0 && kernel);
+
+ mcs = __xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+ op->arg1.mfn = mfn;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ if (kernel) {
+ percpu_write(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so it's atomic with
+ respect to ipis */
+ percpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+
+ BUG_ON(page->private != 0);
+
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+ user_pgd[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ ret = 0;
+ }
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ pgprot_t prot = PAGE_KERNEL;
+
+ if (PagePinned(page))
+ prot = PAGE_KERNEL_RO;
+
+ if (0 && PageHighMem(page))
+ printk("mapping highpte %lx type %d prot %s\n",
+ page_to_pfn(page), type,
+ (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+ return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+
+ return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+ doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ pte = mask_rw_pte(ptep, pte);
+
+ xen_set_pte(ptep, pte);
+}
+#endif
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+ only init_mm and anything attached to that is pinned. */
+static void xen_release_pte_init(unsigned long pfn)
+{
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
+/* This needs to make sure the new pte page is pinned iff it's being
+ attached to a pinned pagetable. */
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ SetPagePinned(page);
+
+ vm_unmap_aliases();
+ if (!PageHighMem(page)) {
+ make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+ if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+ } else {
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+ }
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(page)) {
+ if (!PageHighMem(page)) {
+ if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ }
+ ClearPagePinned(page);
+ }
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long top = HYPERVISOR_VIRT_START;
+ struct xen_platform_parameters pp;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+ top = pp.virt_start;
+
+ reserve_top_address(-top);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up).
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(paddr + __START_KERNEL_map);
+#else
+ return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+ phys_addr_t paddr;
+
+ maddr &= PTE_PFN_MASK;
+ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+ return __ka(m2p(maddr));
+}
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+ BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+ unsigned pmdidx, pteidx;
+ unsigned ident_pte;
+ unsigned long pfn;
+
+ ident_pte = 0;
+ pfn = 0;
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ pte_t *pte_page;
+
+ /* Reuse or allocate a page of ptes */
+ if (pmd_present(pmd[pmdidx]))
+ pte_page = m2v(pmd[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+ if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+ ident_pte += PTRS_PER_PTE;
+
+ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+ }
+
+ /* Install mappings */
+ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ pte_t pte;
+
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+
+ if (!pte_none(pte_page[pteidx]))
+ continue;
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+ pte_page[pteidx] = pte;
+ }
+ }
+
+ for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working. We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ unsigned long max_pfn)
+{
+ pud_t *l3;
+ pmd_t *l2;
+
+ /* Zap identity mapping */
+ init_level4_pgt[0] = __pgd(0);
+
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ convert_pfn_mfn(init_level4_pgt);
+ convert_pfn_mfn(level3_ident_pgt);
+ convert_pfn_mfn(level3_kernel_pgt);
+
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+ memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ /* Set up identity map */
+ xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ /* Switch over */
+ pgd = init_level4_pgt;
+
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(pgd));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+ reserve_early(__pa(xen_start_info->pt_base),
+ __pa(xen_start_info->pt_base +
+ xen_start_info->nr_pt_frames * PAGE_SIZE),
+ "XEN PAGETABLES");
+
+ return pgd;
+}
+#else /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ unsigned long max_pfn)
+{
+ pmd_t *kernel_pmd;
+
+ init_pg_tables_start = __pa(pgd);
+ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+ max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+ memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ xen_write_cr3(__pa(swapper_pg_dir));
+
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+ return swapper_pg_dir;
+}
+#endif /* CONFIG_X86_64 */
+
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+ pte_t pte;
+
+ phys >>= PAGE_SHIFT;
+
+ switch (idx) {
+ case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+ case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+ case FIX_WP_TEST:
+ case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+#endif
+ pte = pfn_pte(phys, prot);
+ break;
+
+ default:
+ pte = mfn_pte(phys, prot);
+ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
+}
+
+__init void xen_post_allocator_init(void)
+{
+ pv_mmu_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.set_pmd = xen_set_pmd;
+ pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+ pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+ xen_mark_init_mm_pinned();
+}
+
+
+const struct pv_mmu_ops xen_mmu_ops __initdata = {
+ .pagetable_setup_start = xen_pagetable_setup_start,
+ .pagetable_setup_done = xen_pagetable_setup_done,
+
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3,
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_others = xen_flush_tlb_others,
+
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
+
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pte_init,
+ .alloc_pmd_clone = paravirt_nop,
+ .release_pmd = xen_release_pte_init,
+
+#ifdef CONFIG_HIGHPTE
+ .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+#ifdef CONFIG_X86_64
+ .set_pte = xen_set_pte,
+#else
+ .set_pte = xen_set_pte_init,
+#endif
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd_hyper,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+ .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+ .make_pte = PV_CALLEE_SAVE(xen_make_pte),
+ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .set_pte_present = xen_set_pte_at,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+ .set_pud = xen_set_pud_hyper,
+
+ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#if PAGETABLE_LEVELS == 4
+ .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+ .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+ .set_pgd = xen_set_pgd_hyper,
+
+ .alloc_pud = xen_alloc_pte_init,
+ .release_pud = xen_release_pte_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy,
+ },
+
+ .set_fixmap = xen_set_fixmap,
+};
+
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5..24d1b44a337 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
+unsigned long xen_read_cr2_direct(void);
+
+extern const struct pv_mmu_ops xen_mmu_ops;
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8ea8a0d0b0d..c738644b543 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -154,7 +154,7 @@ void xen_mc_flush(void)
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
- printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+ printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 85893824161..9e565da5d1f 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -19,8 +19,10 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
+ unsigned long flags;
/* need to disable interrupts until this entry is complete */
- local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+ local_irq_save(flags);
+ __get_cpu_var(xen_mc_irq_flags) = flags;
}
static inline struct multicall_space xen_mc_entry(size_t args)
@@ -39,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+ local_irq_restore(percpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d6790108388..15c6c68db6a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -28,6 +28,9 @@
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+extern void xen_sysenter_target(void);
+extern void xen_syscall_target(void);
+extern void xen_syscall32_target(void);
/**
@@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func)
void __cpuinit xen_enable_sysenter(void)
{
- extern void xen_sysenter_target(void);
int ret;
unsigned sysenter_feature;
@@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
- extern void xen_syscall_target(void);
- extern void xen_syscall32_target(void);
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
if (ret != 0) {
@@ -160,7 +160,8 @@ void __init xen_arch_setup(void)
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d77da613b1d..035582ae815 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -33,7 +33,7 @@
#include "xen-ops.h"
#include "mmu.h"
-cpumask_t xen_cpu_initialized_map;
+cpumask_var_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
return IRQ_HANDLED;
}
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();
cpu_set(cpu, cpu_online_map);
- x86_write_percpu(cpu_state, CPU_ONLINE);
+ percpu_write(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
@@ -158,7 +154,7 @@ static void __init xen_fill_possible_map(void)
{
int i, rc;
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
- make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
+ make_lowmem_page_readwrite(xen_initial_gdt);
xen_setup_vcpu_info_placement();
}
@@ -192,11 +188,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
if (xen_smp_intr_init(0))
BUG();
- xen_cpu_initialized_map = cpumask_of_cpu(0);
+ if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+ panic("could not allocate xen_cpu_initialized_map\n");
+
+ cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
- for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--)
+ for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
cpu_clear(cpu, cpu_possible_map);
}
@@ -221,7 +220,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
struct vcpu_guest_context *ctxt;
struct desc_struct *gdt;
- if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
+ if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
@@ -236,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
+#else
+ ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -280,23 +281,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;
-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- WARN_ON(cpu == 0);
- if (cpu > 0) {
- rc = get_local_pda(cpu);
- if (rc)
- return rc;
- }
-#endif
-
-#ifdef CONFIG_X86_32
- init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
+#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
@@ -362,7 +354,7 @@ static void xen_cpu_die(unsigned int cpu)
alternatives_smp_switch(0);
}
-static void xen_play_dead(void)
+static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
@@ -408,24 +400,23 @@ static void xen_smp_send_reschedule(int cpu)
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+static void xen_send_IPI_mask(const struct cpumask *mask,
+ enum ipi_vector vector)
{
unsigned cpu;
- cpus_and(mask, mask, cpu_online_map);
-
- for_each_cpu_mask_nr(cpu, mask)
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
xen_send_IPI_one(cpu, vector);
}
-static void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
{
int cpu;
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run if they need to. */
- for_each_cpu_mask_nr(cpu, mask) {
+ for_each_cpu(cpu, mask) {
if (xen_vcpu_stolen(cpu)) {
HYPERVISOR_sched_op(SCHEDOP_yield, 0);
break;
@@ -435,18 +426,15 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
static void xen_smp_send_call_function_single_ipi(int cpu)
{
- xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
+ xen_send_IPI_mask(cpumask_of(cpu),
+ XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
@@ -456,11 +444,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 2a234db5949..95be7b43472 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -6,6 +6,7 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
+#include <asm/fixmap.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -35,7 +36,8 @@ void xen_post_suspend(int suspend_cancelled)
pfn_to_mfn(xen_start_info->console.domU.mfn);
} else {
#ifdef CONFIG_SMP
- xen_cpu_initialized_map = cpu_online_map;
+ BUG_ON(xen_cpu_initialized_map == NULL);
+ cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
#endif
xen_vcpu_restore();
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c9f7cda48ed..14f24062349 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -132,8 +132,7 @@ static void do_stolen_accounting(void)
*snap = state;
/* Add the appropriate number of ticks of stolen time,
- including any left-overs from last time. Passing NULL to
- account_steal_time accounts the time as stolen. */
+ including any left-overs from last time. */
stolen = runnable + offline + __get_cpu_var(residual_stolen);
if (stolen < 0)
@@ -141,11 +140,10 @@ static void do_stolen_accounting(void)
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__get_cpu_var(residual_stolen) = stolen;
- account_steal_time(NULL, ticks);
+ account_steal_ticks(ticks);
/* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. Passing idle to
- account_steal_time accounts the time as idle/wait. */
+ including any left-overs from last time. */
blocked += __get_cpu_var(residual_blocked);
if (blocked < 0)
@@ -153,7 +151,7 @@ static void do_stolen_accounting(void)
ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
__get_cpu_var(residual_blocked) = blocked;
- account_steal_time(idle_task(smp_processor_id()), ticks);
+ account_idle_ticks(ticks);
}
/*
@@ -437,7 +435,7 @@ void xen_setup_timer(int cpu)
evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
- evt->cpumask = cpumask_of_cpu(cpu);
+ evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
setup_runstate_info(cpu);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 00000000000..79d7362ad6d
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,142 @@
+/*
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+
+#include "xen-asm.h"
+
+/*
+ * Enable events. This clears the event mask and then tests the
+ * pending event status. If there are pending events, then enter
+ * the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jz 1f
+
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ * (xen_)save_fl is used to get the current interrupt enable status.
+ * Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * may be set in the return value. We take advantage of this by
+ * making sure that X86_EFLAGS_IF has the right value (and other bits
+ * in that byte are 0), but other bits in the return value are
+ * undefined. We need to toggle the state of the bit, because Xen and
+ * x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ setz %ah
+ addb %ah, %ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+
+/*
+ * In principle the caller should be passing us a value returned from
+ * xen_save_fl_direct, but for robustness' sake we test only the
+ * X86_EFLAGS_IF flag rather than the whole byte. After setting the
+ * interrupt mask state, it checks for unmasked pending events and
+ * enters the hypervisor to get them delivered if so.
+ */
+ENTRY(xen_restore_fl_direct)
+#ifdef CONFIG_X86_64
+ testw $X86_EFLAGS_IF, %di
+#else
+ testb $X86_EFLAGS_IF>>8, %ah
+#endif
+ setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+check_events:
+#ifdef CONFIG_X86_32
+ push %eax
+ push %ecx
+ push %edx
+ call xen_force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+#else
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call xen_force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+#endif
+ ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 00000000000..465276467a4
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
+#ifndef _XEN_XEN_ASM_H
+#define _XEN_XEN_ASM_H
+
+#include <linux/linkage.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c..88e15deb8b8 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
-#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
- /* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-
-2: call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
- ret
- ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
-
-
-/*
- Disabling events is simply a matter of making the event mask
- non-zero.
- */
-ENTRY(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
- ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
+#include "xen-asm.h"
/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
*/
-ENTRY(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- setz %ah
- addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
- ret
- ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
-
-
-/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
- */
-ENTRY(xen_restore_fl_direct)
- testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call xen_force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
ret
- ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
/*
- We can't use sysexit directly, because we're not running in ring0.
- But we can easily fake it up using iret. Assuming xen_sysexit
- is jumped to with a standard stack frame, we can just strip it
- back to a standard iret frame and use iret.
+ * We can't use sysexit directly, because we're not running in ring0.
+ * But we can easily fake it up using iret. Assuming xen_sysexit is
+ * jumped to with a standard stack frame, we can just strip it back to
+ * a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
ENDPROC(xen_sysexit)
/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
-
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
-
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.
+ * This is run where a normal iret would be run, with the same stack setup:
+ * 8: eflags
+ * 4: cs
+ * esp-> 0: eip
+ *
+ * This attempts to make sure that any pending events are dealt with
+ * on return to usermode, but there is a small window in which an
+ * event can happen just before entering usermode. If the nested
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
+ * flags, they will not be tested again before returning to
+ * usermode. This means that a process can end up with pending work,
+ * which will be unprocessed until the process enters and leaves the
+ * kernel again, which could be an unbounded amount of time. This
+ * means that a pending signal or reschedule event could be
+ * indefinitely delayed.
+ *
+ * The fix is to notice a nested interrupt in the critical window, and
+ * if one occurs, then fold the nested interrupt into the current
+ * interrupt stack frame, and re-process it iteratively rather than
+ * recursively. This means that it will exit via the normal path, and
+ * all pending work will be dealt with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
+ * stack state in whatever form it's in, we keep things simple by only
+ * using a single register which is pushed/popped on the stack.
*/
ENTRY(xen_iret)
/* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
+ /*
+ * Store vcpu_info pointer for easy access. Do it this way to
+ * avoid having to reload %fs
+ */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- mov per_cpu__xen_vcpu(%eax),%eax
+ movl TI_cpu(%eax), %eax
+ movl __per_cpu_offset(,%eax,4), %eax
+ mov per_cpu__xen_vcpu(%eax), %eax
#else
movl per_cpu__xen_vcpu, %eax
#endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
+ /*
+ * Maybe enable events. Once this happens we could get a
+ * recursive event, so the critical region starts immediately
+ * afterwards. However, if that happens we don't end up
+ * resuming the code, so we don't have to be worried about
+ * being preempted to another CPU.
+ */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
+ /*
+ * If there's something pending, mask events again so we can
+ * jump back into xen_hypervisor_callback
+ */
sete XEN_vcpu_info_mask(%eax)
popl %eax
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
+ /*
+ * From this point on the registers are restored and the stack
+ * updated, so we don't need to worry about it if we're
+ * preempted
+ */
iret_restore_end:
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
+ /*
+ * Jump to hypervisor_callback after fixing up the stack.
+ * Events are masked, so jumping out of the critical region is
+ * OK.
+ */
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
-.section __ex_table,"a"
+.section __ex_table, "a"
.align 4
- .long 1b,iret_exc
+ .long 1b, iret_exc
.previous
hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
.globl xen_iret_start_crit, xen_iret_end_crit
/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }<- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
+ * This is called by xen_hypervisor_callback in entry.S when it sees
+ * that the EIP at the time of interrupt was between
+ * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
+ * %eax so we can do a more refined determination of what to do.
+ *
+ * The stack format at this point is:
+ * ----------------
+ * ss : (ss/esp may be present if we came from usermode)
+ * esp :
+ * eflags } outer exception info
+ * cs }
+ * eip }
+ * ---------------- <- edi (copy dest)
+ * eax : outer eax if it hasn't been restored
+ * ----------------
+ * eflags } nested exception info
+ * cs } (no ss/esp because we're nested
+ * eip } from the same ring)
+ * orig_eax }<- esi (copy src)
+ * - - - - - - - -
+ * fs }
+ * es }
+ * ds } SAVE_ALL state
+ * eax }
+ * : :
+ * ebx }<- esp
+ * ----------------
+ *
+ * In order to deliver the nested exception properly, we need to shift
+ * everything from the return addr up to the error code so it sits
+ * just under the outer exception info. This means that when we
+ * handle the exception, we do it in the context of the outer
+ * exception rather than starting a new one.
+ *
+ * The only caveat is that if the outer eax hasn't been restored yet
+ * (ie, it's still on stack), we need to insert its value into the
+ * SAVE_ALL state before going on, since it's usermode state which we
+ * eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
- Paranoia: Make sure we're really coming from kernel space.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
+ * Paranoia: Make sure we're really coming from kernel space.
+ * One could imagine a case where userspace jumps into the
+ * critical range address, but just before the CPU delivers a
+ * GP, it decides to deliver an interrupt instead. Unlikely?
+ * Definitely. Easy to avoid? Yes. The Intel documents
+ * explicitly say that the reported EIP for a bad jump is the
+ * jump instruction itself, not the destination, but some
+ * virtual environments get this wrong.
*/
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
+ /*
+ * If eip is before iret_restore_end then stack
+ * hasn't been restored yet.
+ */
cmp $iret_restore_end, %eax
jae 1f
- movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
+ movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
movl %eax, PT_EAX(%esp)
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+ lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/* set up the copy */
1: std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
rep movsl
cld
- lea 4(%edi),%esp /* point esp to new frame */
+ lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call xen_force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e8..02f496a8dba 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,174 +1,45 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/processor-flags.h>
#include <asm/errno.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-#if 1
-/*
- x86-64 does not yet support direct access to percpu variables
- via a segment override, so we just need to make sure this code
- never gets used
- */
-#define BUG ud2a
-#define PER_CPU_VAR(var, off) 0xdeadbeef
-#endif
-
-/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
- BUG
-
- /* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
- jz 1f
-
-2: call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
- ret
- ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
-
-/*
- Disabling events is simply a matter of making the event mask
- non-zero.
- */
-ENTRY(xen_irq_disable_direct)
- BUG
-
- movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-ENDPATCH(xen_irq_disable_direct)
- ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
-
-/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
- BUG
-
- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
- setz %ah
- addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
- ret
- ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
-
-/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
- */
-ENTRY(xen_restore_fl_direct)
- BUG
-
- testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
- ret
- ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
-
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %rax
- push %rcx
- push %rdx
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- call xen_force_evtchn_callback
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rdx
- pop %rcx
- pop %rax
- ret
+#include "xen-asm.h"
ENTRY(xen_adjust_exception_frame)
- mov 8+0(%rsp),%rcx
- mov 8+8(%rsp),%r11
+ mov 8+0(%rsp), %rcx
+ mov 8+8(%rsp), %r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
- Xen64 iret frame:
-
- ss
- rsp
- rflags
- cs
- rip <-- standard iret frame
-
- flags
-
- rcx }
- r11 }<-- pushed by hypercall page
-rsp -> rax }
+ * Xen64 iret frame:
+ *
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip <-- standard iret frame
+ *
+ * flags
+ *
+ * rcx }
+ * r11 }<-- pushed by hypercall page
+ * rsp->rax }
*/
ENTRY(xen_iret)
pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
- sysexit is not used for 64-bit processes, so it's
- only ever used to return to 32-bit compat userspace.
+ * sysexit is not used for 64-bit processes, so it's only ever used to
+ * return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
- /* We're already on the usermode stack at this point, but still
- with the kernel gs, so we can easily switch back */
- movq %rsp, %gs:pda_oldrsp
- movq %gs:pda_kernelstack,%rsp
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
- pushq %gs:pda_oldrsp
+ pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
- /* We're already on the usermode stack at this point, but still
- with the kernel gs, so we can easily switch back */
- movq %rsp, %gs:pda_oldrsp
- movq %gs:pda_kernelstack, %rsp
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
- pushq %gs:pda_oldrsp
+ pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
- Xen handles syscall callbacks much like ordinary exceptions,
- which means we have:
- - kernel gs
- - kernel rsp
- - an iret-like stack frame on the stack (including rcx and r11):
- ss
- rsp
- rflags
- cs
- rip
- r11
- rsp-> rcx
-
- In all the entrypoints, we undo all that to make it look
- like a CPU-generated syscall/sysenter and jump to the normal
- entrypoint.
+ * Xen handles syscall callbacks much like ordinary exceptions, which
+ * means we have:
+ * - kernel gs
+ * - kernel rsp
+ * - an iret-like stack frame on the stack (including rcx and r11):
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip
+ * r11
+ * rsp->rcx
+ *
+ * In all the entrypoints, we undo all that to make it look like a
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
.macro undo_xen_syscall
- mov 0*8(%rsp),%rcx
- mov 1*8(%rsp),%r11
- mov 5*8(%rsp),%rsp
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
- lea 16(%rsp), %rsp /* strip %rcx,%r11 */
+ lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $VGCF_in_syscall
jmp hypercall_iret
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index d7422dc2a55..2f5ef2632ea 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+extern void *xen_initial_gdt;
+
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
+void xen_setup_machphys_mapping(void);
+pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_ident_map_ISA(void);
+void xen_reserve_top(void);
+
+void xen_leave_lazy(void);
+void xen_post_allocator_init(void);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
@@ -49,7 +59,7 @@ bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
-void __init xen_setup_vcpu_info_placement(void);
+void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);
@@ -58,7 +68,7 @@ void __init xen_init_spinlocks(void);
__cpuinit void xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
-extern cpumask_t xen_cpu_initialized_map;
+extern cpumask_var_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
#endif